Here’s a small shell script for running the recrawl process in Nutch. You might have to change certain lines because I made some customizations, but it should work for you too.
recrawl.sh
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
#!/bin/sh
# Runs the Nutch bot to crawl or re-crawl
# Usage: bin/runbot [depth] [adddays] [safe]
#   depth   - number of generate/fetch/parse/update rounds (default: 5)
#   adddays - value handed to "nutch generate -adddays" (default: 7)
#   safe    - pass the literal word "safe" as third argument to enable
#             safe mode
# If executed in 'safe' mode, it doesn't delete the temporary
# directories generated during crawl. This might be helpful for
# analysis and recovery in case a crawl fails.
#
# Environment:
#   NUTCH_HOME    - Nutch installation directory (default: /home/nutch/)
#   CATALINA_HOME - Tomcat base directory (default: /srv/www/tomcat5/base/)
#
# Author: Susam Pal

# Positional arguments fall back to their defaults when missing or empty.
depth=${1:-5}
threads=50
adddays=${2:-7}
topN=10000 # Comment this statement if you don't want to set topN value

# Parse arguments
# "=" is the POSIX string comparison; "==" is not accepted by every /bin/sh.
if [ "$3" = "safe" ]
then
  safe=yes
fi

if [ -z "$NUTCH_HOME" ]
then
  NUTCH_HOME=/home/nutch/
  echo "runbot: $0 could not find environment variable NUTCH_HOME"
  echo "runbot: NUTCH_HOME=$NUTCH_HOME has been set by the script"
else
  echo "runbot: $0 found environment variable NUTCH_HOME=$NUTCH_HOME"
fi

if [ -z "$CATALINA_HOME" ]
then
  CATALINA_HOME=/srv/www/tomcat5/base/
  echo "runbot: $0 could not find environment variable CATALINA_HOME"
  echo "runbot: CATALINA_HOME=$CATALINA_HOME has been set by the script"
else
  echo "runbot: $0 found environment variable CATALINA_HOME=$CATALINA_HOME"
fi

# Turn the numeric topN value into the option string appended to the
# "nutch generate" command line; leave it empty when topN is not set.
if [ -n "$topN" ]
then
  topN="-topN $topN"
else
  topN=""
fi
# Total number of steps, used in the progress banners.
steps=10

echo "----- Inject (Step 1 of $steps) -----"
"$NUTCH_HOME/bin/nutch" inject crawl/crawldb seed

echo "----- Generate, Fetch, Parse, Update (Step 2 of $steps) -----"
# A plain while loop with arithmetic expansion keeps this runnable under
# /bin/sh; "for ((...))" is a bash-only construct.
round=0
while [ "$round" -lt "$depth" ]
do
  round=$((round + 1))
  echo "--- Beginning crawl at depth $round of $depth ---"

  # $topN is deliberately unquoted: it is either empty or an option plus
  # its value, and has to split into separate arguments.
  if ! "$NUTCH_HOME/bin/nutch" generate crawl/crawldb crawl/segments $topN -adddays "$adddays"
  then
    echo "runbot: Stopping at depth $round. No more URLs to fetch."
    break
  fi

  # Take the last entry of crawl/segments in sorted glob order.
  # NOTE(review): assumes the segment just generated sorts last - confirm.
  segment=
  for candidate in crawl/segments/*
  do
    [ -e "$candidate" ] && segment=$candidate
  done
  if [ -z "$segment" ]
  then
    echo "runbot: no segment found in crawl/segments at depth $round." >&2
    continue
  fi

  if ! "$NUTCH_HOME/bin/nutch" fetch "$segment" -threads "$threads"
  then
    echo "runbot: fetch $segment at depth $round failed. Deleting it."
    rm -rf -- "$segment"
    continue
  fi

  # NOTE(review): if the fetcher is configured to parse while fetching,
  # this separate parse step may be redundant - confirm against the
  # fetcher settings in use.
  echo "--- Parsing Segment $segment ---"
  "$NUTCH_HOME/bin/nutch" parse "$segment"
  "$NUTCH_HOME/bin/nutch" updatedb crawl/crawldb "$segment"
done
echo "----- Stopping Tomcat (Step 3 of $steps) -----"
sudo /etc/init.d/tomcat5 stop

echo "----- Merge Segments (Step 4 of $steps) -----"
# Merge every fetched segment into crawl/MERGEDsegments. On failure stop
# here with a diagnostic and an explicit non-zero status, before any of
# the existing segments are removed or moved.
if ! "$NUTCH_HOME/bin/nutch" mergesegs crawl/MERGEDsegments crawl/segments/*
then
  echo "runbot: merging segments failed; aborting. Tomcat has been left stopped." >&2
  exit 1
fi

if [ "$safe" != "yes" ]
then
  rm -rf crawl/segments/*
else
  # Safe mode keeps the unmerged segments; -p tolerates a directory left
  # behind by an earlier safe-mode run.
  mkdir -p crawl/FETCHEDsegments
  mv --verbose crawl/segments/* crawl/FETCHEDsegments
fi

# Put the merged segment in place of the individual ones.
mv --verbose crawl/MERGEDsegments/* crawl/segments
rmdir crawl/MERGEDsegments
echo "----- Invert Links (Step 5 of $steps) -----"
"$NUTCH_HOME/bin/nutch" invertlinks crawl/linkdb crawl/segments/*

echo "----- Index (Step 6 of $steps) -----"
# Abort before touching crawl/index if no new index could be built;
# otherwise the swap below would move the live index away and, outside
# safe mode, delete it.
if ! "$NUTCH_HOME/bin/nutch" index crawl/NEWindexes crawl/crawldb crawl/linkdb crawl/segments/*
then
  echo "runbot: indexing failed; leaving the existing crawl/index untouched." >&2
  exit 1
fi

echo "----- Dedup (Step 7 of $steps) -----"
"$NUTCH_HOME/bin/nutch" dedup crawl/NEWindexes

echo "----- Merge Indexes (Step 8 of $steps) -----"
if ! "$NUTCH_HOME/bin/nutch" merge crawl/MERGEDindexes crawl/NEWindexes
then
  echo "runbot: merging indexes failed; leaving the existing crawl/index untouched." >&2
  exit 1
fi

# in nutch-site, hadoop.tmp.dir points to crawl/tmp
rm -rf crawl/tmp/*

# replace the live index with the freshly merged one; on a first crawl
# there is no previous index to set aside
if [ -e crawl/index ]
then
  mv --verbose crawl/index crawl/OLDindexes
fi
mv --verbose crawl/MERGEDindexes crawl/index

# clean up old indexes directories (kept in safe mode)
if [ "$safe" != "yes" ]
then
  rm -rf crawl/NEWindexes
  rm -rf crawl/OLDindexes
fi
echo "----- Reloading index on the search site (Step 9 of $steps) -----"
# Outside safe mode, update the timestamp of the web application's
# web.xml; in safe mode only print the command so it can be run by hand.
if [ "$safe" != "yes" ]
then
  touch "${CATALINA_HOME}/webapps/ROOT/WEB-INF/web.xml"
  echo "Done!"
else
  echo "runbot: Can not reload index in safe mode."
  echo "runbot: Please reload it manually using the following command:"
  echo "runbot: touch ${CATALINA_HOME}/webapps/ROOT/WEB-INF/web.xml"
fi

echo "----- Restarting Tomcat (Step 10 of $steps) -----"
# Tomcat is already stopped in step 3; the stop here is repeated before
# starting it again.
sudo /etc/init.d/tomcat5 stop
sudo /etc/init.d/tomcat5 start
echo "runbot: FINISHED: Crawl completed!"
It looks like you run nutch parse, but your fetcher isn’t called with -noParsing, meaning that the call to parse isn’t needed as fetcher will parse by default.
Posted by Max | 3. August 2010, 21:14
I want to use automatic crawling in nutch-1.0 with a timer.
How can I use your recrawl script?
Please give me a few hints.
Posted by Aung Ko Win | 1. März 2011, 06:05