Note: The information contained in this post may be outdated!
Here’s a small shell script for doing the recrawl process in nutch. You might have to change certain lines because I did some customizations, but it should work for you too 🙂
recrawl.sh
#!/bin/sh
# Runs the Nutch bot to crawl or re-crawl
# Usage: bin/runbot [depth] [adddays] [safe]
# If executed in 'safe' mode, it doesn't delete the temporary
# directories generated during crawl. This might be helpful for
# analysis and recovery in case a crawl fails.
#
# Author: Susam Pal
# --- Command-line arguments and crawl tunables ---

# $1: crawl depth, i.e. how many generate/fetch rounds the loop below runs.
# An empty or missing argument selects the default of 5.
depth=${1:-5}

# Thread count handed to "nutch fetch -threads".
threads=50

# $2: value handed to "nutch generate -adddays".
# An empty or missing argument selects the default of 7.
adddays=${2:-7}

topN=10000 # Comment this statement if you don't want to set topN value

# $3: the literal word "safe" turns on safe mode, in which the temporary
# crawl directories are kept instead of deleted.
# The flag is initialised explicitly so a stray "safe" variable inherited
# from the environment cannot enable safe mode by accident, and the
# comparison uses "=" because "==" is not a valid operator for test/[ in
# POSIX shells such as dash (this script runs under /bin/sh).
safe=no
if [ "${3:-}" = "safe" ]; then
  safe=yes
fi
# Resolve the Nutch installation directory: keep NUTCH_HOME when the caller
# supplies a non-empty value, otherwise fall back to the built-in path.
# All expansions are quoted so paths containing whitespace or glob
# characters are reported verbatim.
if [ -z "${NUTCH_HOME:-}" ]; then
  NUTCH_HOME=/home/nutch/
  echo "runbot: $0 could not find environment variable NUTCH_HOME"
  echo "runbot: NUTCH_HOME=$NUTCH_HOME has been set by the script"
else
  echo "runbot: $0 found environment variable NUTCH_HOME=$NUTCH_HOME"
fi

# Same resolution for the Tomcat base directory, which is used later to
# locate the search webapp's web.xml.
if [ -z "${CATALINA_HOME:-}" ]; then
  CATALINA_HOME=/srv/www/tomcat5/base/
  echo "runbot: $0 could not find environment variable CATALINA_HOME"
  echo "runbot: CATALINA_HOME=$CATALINA_HOME has been set by the script"
else
  echo "runbot: $0 found environment variable CATALINA_HOME=$CATALINA_HOME"
fi
# Convert the numeric topN setting into the option string that is appended
# to the "nutch generate" command line; leave it empty when no limit is
# configured. The number must be taken from $topN itself: the previous
# "$rank" was never assigned anywhere in the script, so the limit was
# silently lost. The generator's option is spelled with a single dash.
if [ -n "${topN:-}" ]; then
  topN="-topN $topN"
else
  topN=""
fi
# Total number of steps announced in the progress banners.
steps=10

echo "----- Inject (Step 1 of $steps) -----"
"$NUTCH_HOME/bin/nutch" inject crawl/crawldb seed

echo "----- Generate, Fetch, Parse, Update (Step 2 of $steps) -----"
# One generate/fetch/parse/updatedb round per depth level.
# A POSIX while loop replaces the bash-only "for ((...))" form, which is a
# syntax error under /bin/sh. The counter is advanced at the top of the body
# so that "continue" cannot skip the increment.
round=0
while [ "$round" -lt "$depth" ]; do
  round=$((round + 1))
  echo "--- Beginning crawl at depth $round of $depth ---"

  # $topN is intentionally unquoted: it is either empty or an
  # "option value" pair that has to split into two arguments.
  # shellcheck disable=SC2086
  if ! "$NUTCH_HOME/bin/nutch" generate crawl/crawldb crawl/segments $topN -adddays "$adddays"; then
    # Report the round that actually stopped, not the configured maximum.
    echo "runbot: Stopping at depth $round. No more URLs to fetch."
    break
  fi

  # Pick the last entry of the sorted glob expansion instead of parsing
  # "ls" output. This presumably is the segment just generated, assuming
  # segment directory names sort chronologically - TODO confirm.
  segment=
  for candidate in crawl/segments/*; do
    [ -e "$candidate" ] && segment=$candidate
  done
  if [ -z "$segment" ]; then
    echo "runbot: no segment found under crawl/segments at depth $round." >&2
    break
  fi

  if ! "$NUTCH_HOME/bin/nutch" fetch "$segment" -threads "$threads"; then
    echo "runbot: fetch $segment at depth $round failed. Deleting it."
    rm -rf -- "$segment"
    continue
  fi

  # NOTE(review): if the fetcher is configured to parse while fetching, this
  # separate parse step may be redundant - confirm against the Nutch setup.
  echo "--- Parsing Segment $segment ---"
  "$NUTCH_HOME/bin/nutch" parse "$segment"
  "$NUTCH_HOME/bin/nutch" updatedb crawl/crawldb "$segment"
done
echo "----- Stopping Tomcat (Step 3 of $steps) -----"
sudo /etc/init.d/tomcat5 stop

echo "----- Merge Segments (Step 4 of $steps) -----"
# Merge every fetched segment into crawl/MERGEDsegments, then make the merged
# result the new content of crawl/segments.
if "$NUTCH_HOME/bin/nutch" mergesegs crawl/MERGEDsegments crawl/segments/*; then
  if [ "$safe" != "yes" ]; then
    rm -rf crawl/segments/*
  else
    # Safe mode keeps the unmerged segments. -p lets the script run again
    # when the directory is left over from an earlier safe-mode run.
    mkdir -p crawl/FETCHEDsegments
    mv --verbose crawl/segments/* crawl/FETCHEDsegments
  fi
  mv --verbose crawl/MERGEDsegments/* crawl/segments
  rmdir crawl/MERGEDsegments
else
  # The existing index has not been touched at this point, so bring Tomcat
  # back up before aborting instead of leaving the search site stopped, and
  # exit with an explicit failure status.
  echo "runbot: merging segments failed; restarting Tomcat and aborting." >&2
  sudo /etc/init.d/tomcat5 start
  exit 1
fi
echo "----- Invert Links (Step 5 of $steps) -----"
"$NUTCH_HOME/bin/nutch" invertlinks crawl/linkdb crawl/segments/*

echo "----- Index (Step 6 of $steps) -----"
"$NUTCH_HOME/bin/nutch" index crawl/NEWindexes crawl/crawldb crawl/linkdb crawl/segments/*

echo "----- Dedup (Step 7 of $steps) -----"
"$NUTCH_HOME/bin/nutch" dedup crawl/NEWindexes

echo "----- Merge Indexes (Step 8 of $steps) -----"
"$NUTCH_HOME/bin/nutch" merge crawl/MERGEDindexes crawl/NEWindexes
merge_status=$?

# in nutch-site, hadoop.tmp.dir points to crawl/tmp
rm -rf crawl/tmp/*

# Only swap indexes when the merge really produced a new one. Otherwise the
# live crawl/index would be moved aside (and, outside safe mode, deleted)
# with nothing to replace it.
if [ "$merge_status" -ne 0 ] || [ ! -d crawl/MERGEDindexes ]; then
  echo "runbot: merging indexes failed; keeping the existing crawl/index." >&2
  sudo /etc/init.d/tomcat5 start
  exit 1
fi

# replace indexes with indexes_merged
# A crawl/OLDindexes left behind by an earlier safe-mode run would make "mv"
# nest the live index inside it, so drop the stale backup first.
rm -rf crawl/OLDindexes
if [ -e crawl/index ]; then
  mv --verbose crawl/index crawl/OLDindexes
fi
mv --verbose crawl/MERGEDindexes crawl/index

# clean up old indexes directories
if [ "$safe" != "yes" ]; then
  rm -rf crawl/NEWindexes
  rm -rf crawl/OLDindexes
fi
echo "----- Reloading index on the search site (Step 9 of $steps) -----"
if [ "$safe" != "yes" ]; then
  # Updating web.xml's timestamp is presumably what makes Tomcat reload the
  # search webapp - NOTE(review): confirm. The path is quoted so a
  # CATALINA_HOME containing whitespace still resolves to a single file.
  touch "${CATALINA_HOME}/webapps/ROOT/WEB-INF/web.xml"
  echo 'Done!'
else
  echo "runbot: Can not reload index in safe mode."
  echo "runbot: Please reload it manually using the following command:"
  echo "runbot: touch ${CATALINA_HOME}/webapps/ROOT/WEB-INF/web.xml"
fi

echo "----- Restarting Tomcat (Step 10 of $steps) -----"
# Stop first so the start below always begins from a stopped service.
sudo /etc/init.d/tomcat5 stop
sudo /etc/init.d/tomcat5 start
echo "runbot: FINISHED: Crawl completed!"
It looks like you run nutch parse, but your fetcher isn’t called with -noParsing, meaning that the call to parse isn’t needed as fetcher will parse by default.
I want to run automatic crawling in nutch-1.0 using a timer.
How can I use your recrawl script?
Please give me a few hints.