#! /usr/bin/env bash
# Usage: generates an xmlpipe2 stream on stdout, to be followed by `indexer pmwikis`
# e.g. ./sphinx_sources > pmwiki.sphinxxml && iconv -f ISO-8859-1 -t utf-8 pmwiki.sphinxxml > pmwiki.sphinxxml.utf-8 && mv pmwiki.sphinxxml.utf-8 pmwiki.sphinxxml
# use /home/fabien/pmwiki.sphinxxml to be indexed
# see http://sphinxsearch.com/docs/manual-2.0.1.html#xmlpipe2
# --- Output templates --------------------------------------------------------
# NOTE(review): every literal in this script that should carry markup holds
# only newline escapes or nothing at all. It looks as if the "<...>" parts
# were stripped from the file at some point. The values are deliberately left
# byte-for-byte as found; restore them from a pristine copy, following the
# xmlpipe2 format linked in the file header, rather than guessing.
XMLHEADER='\n'
# XMLFOOTER and KILLLIST are not referenced in the part of the script visible
# here.
XMLFOOTER=''
XMLSCHEMA='\n\n\n\n\n\n\n\n\n\n\n\n\n'
KILLLIST='\n'

# Root directory of every wiki to export; pages are read from <root>/wiki.d.
WIKIS=(
  "/path/to/wikis/devpim"
  "/path/to/wikis/wiki"
  "/path/to/wikis/mirrors/agiwiki"
  "/path/to/wikis/mirrors/fabien"
  "/path/to/wikis/mirrors/pim"
  "/path/to/wikis/mirrors/saint-maur"
  "/path/to/wikis/mirrors/wiki"
)

# Charset is assumed; the charset of each page file should be used instead.
# NOTE(review): this is a bare charset rather than a full locale name such as
# en_US.ISO-8859-1 — presumably the C library falls back to its default
# locale; confirm this has the intended effect.
LANG=ISO-8859-1

# PHP file that receives the generated document id => page path entries.
IDXPMWIKI=/path/to/wikis/devpim/pub/sphinx_pmwikis_doc_ids.php

# Start the doc-id index afresh; the page loop appends one entry per page.
# NOTE(review): this line had lost its closing quote and its redirection
# operator, which left a single quote open across the rest of the script and
# made it unparseable. A truncating ">" is assumed because the later writes to
# the same file use ">>" — confirm, and restore the text that belongs inside
# the quotes.
echo -e '' > "$IDXPMWIKI"

# Stream preamble on stdout: header followed by the schema.
echo -e "$XMLHEADER\n$XMLSCHEMA\n"
# Emit one document per wiki page on stdout and append its generated id and
# path to $IDXPMWIKI.
sed_script=~/bin/pmwiki-to-sphinxxml
for wiki_root in "${WIKIS[@]}"; do
  wiki_dir=$wiki_root/wiki.d
  # Glob instead of parsing `ls`, so page names survive word splitting.
  for page_path in "$wiki_dir"/*; do
    # Skips the literal pattern left by an unmatched glob (empty or missing
    # directory) as well as anything that is not a readable page file.
    [[ -f "$page_path" ]] || continue
    page=${page_path##*/}

    # Same exclusions the former `ls -I` patterns applied.
    case "$page" in
      PmWiki.* | *RecentChanges | totalcounter.stat | *,del-*) continue ;;
    esac

    # Nanosecond part of the current time; %s%N and md5sum (not even numeric)
    # are too large.
    # NOTE(review): this id changes on every run and is not guaranteed to be
    # unique within a run.
    doc_id=$(date +%N)

    # NOTE(review): prints an empty line only — presumably the document's
    # opening tag carrying "1$doc_id" stood here before the markup was lost.
    echo ""

    # 1 prepended to avoid type confusion.
    printf '"1%s" => "%s",\n' "$doc_id" "$page_path" >> "$IDXPMWIKI"

    # Done manually since the data are not in the MySQL db to be queried.
    # Metadata lines: everything except the excluded keys, converted by the
    # sed script.
    grep -v \
      -e '^passwd' -e '^updatedto' -e '^text' -e '^diff:' -e '^csum:' \
      -e '^author:' -e '^host:' -e '^newline=' -e '^title=' \
      -- "$page_path" \
      | sed -f "$sed_script"

    # Page body. Deleting [:cntrl:] also covers the BEL and BS characters that
    # used to be removed separately, and strips the trailing newline; the
    # echo below supplies the line break.
    # NOTE(review): `tr -s '[:print:]'` squeezes every run of one repeated
    # printable character to a single character ("aa" becomes "a"), which
    # alters the indexed text — confirm that is intended.
    grep -- '^text=' "$page_path" \
      | tr -s '[:print:]' \
      | sed -f "$sed_script" \
      | tr -d '[:cntrl:]'

    # NOTE(review): prints an empty line only — presumably the document's
    # closing tag stood here.
    echo ""
  done
done
#echo ');'>> $IDXPMWIKI;