//
you're reading...

Java

Recrawl script for nutch

Here’s a small shell script that automates the recrawl process in Nutch. I customized a few things for my own setup (for example the Tomcat paths and init script), so you may have to adjust some lines, but it should work for you too :)

recrawl.sh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/bin/sh
# Runs the Nutch bot to crawl or re-crawl
# Usage: bin/runbot [depth] [adddays] [safe]
# If executed in 'safe' mode, it doesn't delete the temporary
# directories generated during crawl. This might be helpful for
# analysis and recovery in case a crawl fails.
#
# Author: Susam Pal
# ---------------------------------------------------------------------------
# Configuration
#
# Positional arguments (all optional):
#   $1  depth    number of generate/fetch/update rounds      (default: 5)
#   $2  adddays  value passed to "nutch generate -adddays"   (default: 7)
#   $3  "safe"   keep intermediate directories instead of deleting them
#
# Environment (optional, defaults are applied and reported below):
#   NUTCH_HOME     directory that contains bin/nutch
#   CATALINA_HOME  Tomcat base directory that contains webapps/ROOT
# ---------------------------------------------------------------------------
depth=${1:-5}
adddays=${2:-7}
threads=50
topN=10000 # Comment this statement if you don't want to set topN value

# Parse arguments.
# POSIX test(1) compares strings with "="; "==" is a bash extension and is
# rejected by strict /bin/sh implementations such as dash.
if [ "$3" = "safe" ]; then
  safe=yes
fi

if [ -z "$NUTCH_HOME" ]; then
  NUTCH_HOME=/home/nutch/
  echo "runbot: $0 could not find environment variable NUTCH_HOME"
  echo "runbot: NUTCH_HOME=$NUTCH_HOME has been set by the script"
else
  echo "runbot: $0 found environment variable NUTCH_HOME=$NUTCH_HOME"
fi

if [ -z "$CATALINA_HOME" ]; then
  CATALINA_HOME=/srv/www/tomcat5/base/
  echo "runbot: $0 could not find environment variable CATALINA_HOME"
  echo "runbot: CATALINA_HOME=$CATALINA_HOME has been set by the script"
else
  echo "runbot: $0 found environment variable CATALINA_HOME=$CATALINA_HOME"
fi

# Turn the numeric limit into the option string handed to "nutch generate".
# The value must come from $topN itself (no other variable holds it), and the
# option is spelled with a single dash, like -adddays and -threads.
if [ -n "$topN" ]; then
  topN="-topN $topN"
else
  topN=""
fi
# Total number of numbered steps; only used in the progress banners.
steps=10

echo "----- Inject (Step 1 of $steps) -----"
"$NUTCH_HOME/bin/nutch" inject crawl/crawldb seed

echo "----- Generate, Fetch, Parse, Update (Step 2 of $steps) -----"
# A plain counter loop is used instead of "for ((...))": the C-style for loop
# is a bash extension and is a syntax error under a strict /bin/sh.
round=0
while [ "$round" -lt "$depth" ]; do
  round=$((round + 1))
  echo "--- Beginning crawl at depth $round of $depth ---"

  # $topN is intentionally unquoted: it holds either nothing or an
  # "option value" pair that has to be split into two arguments.
  if ! "$NUTCH_HOME/bin/nutch" generate crawl/crawldb crawl/segments $topN -adddays "$adddays"; then
    echo "runbot: Stopping at depth $round. No more URLs to fetch."
    break
  fi

  # Pick the last segment directory in glob (sorted) order, without parsing
  # the output of ls.
  segment=
  for candidate in crawl/segments/*; do
    if [ -d "$candidate" ]; then
      segment=$candidate
    fi
  done
  if [ -z "$segment" ]; then
    echo "runbot: no segment found in crawl/segments after generate." >&2
    break
  fi

  if ! "$NUTCH_HOME/bin/nutch" fetch "$segment" -threads "$threads"; then
    echo "runbot: fetch $segment at depth $round failed. Deleting it."
    rm -rf -- "$segment"
    continue
  fi

  # NOTE(review): depending on the Nutch version/configuration the fetcher may
  # already parse while fetching, which would make this step redundant --
  # confirm against the fetcher settings in use.
  echo "--- Parsing Segment $segment ---"
  "$NUTCH_HOME/bin/nutch" parse "$segment"
  "$NUTCH_HOME/bin/nutch" updatedb crawl/crawldb "$segment"
done
# Tomcat is stopped before the segments and indexes are rewritten and is
# started again at the end of the script.
echo "----- Stopping Tomcat (Step 3 of $steps) -----"
sudo /etc/init.d/tomcat5 stop

echo "----- Merge Segments (Step 4 of $steps) -----"
if "$NUTCH_HOME/bin/nutch" mergesegs crawl/MERGEDsegments crawl/segments/*; then
  if [ "$safe" != "yes" ]; then
    rm -rf crawl/segments/*
  else
    # -p: the directory may still exist from an earlier run in safe mode.
    mkdir -p crawl/FETCHEDsegments
    mv --verbose crawl/segments/* crawl/FETCHEDsegments
  fi
  mv --verbose crawl/MERGEDsegments/* crawl/segments
  rmdir crawl/MERGEDsegments
else
  # Do not leave the search site offline: Tomcat was stopped just above, so
  # bring it back before giving up, and report the failure explicitly.
  echo "runbot: mergesegs failed; restarting Tomcat and aborting." >&2
  sudo /etc/init.d/tomcat5 start
  exit 1
fi
# Abort the index rebuild after a failed step.
#   $1 - name of the step that failed (used in the message only)
# The existing crawl/index has not been touched at any point where this is
# called. Tomcat is stopped earlier in the script (Step 3), so it is started
# again here before exiting with a non-zero status.
runbot_abort_index() {
  echo "runbot: $1 failed; existing crawl/index left untouched." >&2
  sudo /etc/init.d/tomcat5 start
  exit 1
}

echo "----- Invert Links (Step 5 of $steps) -----"
"$NUTCH_HOME/bin/nutch" invertlinks crawl/linkdb crawl/segments/* \
  || runbot_abort_index invertlinks

echo "----- Index (Step 6 of $steps) -----"
"$NUTCH_HOME/bin/nutch" index crawl/NEWindexes crawl/crawldb crawl/linkdb crawl/segments/* \
  || runbot_abort_index index

echo "----- Dedup (Step 7 of $steps) -----"
# A failed dedup still leaves usable (if redundant) indexes, so only warn.
"$NUTCH_HOME/bin/nutch" dedup crawl/NEWindexes \
  || echo "runbot: dedup failed; continuing with non-deduplicated indexes." >&2

echo "----- Merge Indexes (Step 8 of $steps) -----"
"$NUTCH_HOME/bin/nutch" merge crawl/MERGEDindexes crawl/NEWindexes \
  || runbot_abort_index merge

# in nutch-site, hadoop.tmp.dir points to crawl/tmp
rm -rf crawl/tmp/*

# Replace the live index with the merged one. This is only done once the
# merged index really exists; otherwise the cleanup below would delete the
# only working index.
if [ ! -d crawl/MERGEDindexes ]; then
  runbot_abort_index "merge (crawl/MERGEDindexes missing)"
fi
if [ -e crawl/index ]; then
  mv --verbose crawl/index crawl/OLDindexes
fi
mv --verbose crawl/MERGEDindexes crawl/index

# clean up old indexes directories
if [ "$safe" != "yes" ]; then
  rm -rf crawl/NEWindexes
  rm -rf crawl/OLDindexes
fi
# Step 9 touches the web application's web.xml (unless running in safe mode,
# where the command is only printed for the operator to run by hand).
echo "----- Reloading index on the search site (Step 9 of $steps) -----"
if [ "$safe" != "yes" ]; then
  # Only report success when the touch actually worked.
  if touch "${CATALINA_HOME}/webapps/ROOT/WEB-INF/web.xml"; then
    echo "Done!"
  else
    echo "runbot: could not touch ${CATALINA_HOME}/webapps/ROOT/WEB-INF/web.xml" >&2
  fi
else
  echo "runbot: Can not reload index in safe mode."
  echo "runbot: Please reload it manually using the following command:"
  echo "runbot: touch ${CATALINA_HOME}/webapps/ROOT/WEB-INF/web.xml"
fi

# Step 10 brings Tomcat back up. The stop is issued first so that the start
# always begins from a stopped service.
echo "----- Restarting Tomcat (Step 10 of $steps) -----"
sudo /etc/init.d/tomcat5 stop
sudo /etc/init.d/tomcat5 start
echo "runbot: FINISHED: Crawl completed!"

Discussion

2 Responses to “Recrawl script for nutch”

  1. It looks like you run nutch parse, but your fetcher isn’t called with -noParsing, meaning that the call to parse isn’t needed as fetcher will parse by default.

    Posted by Max | 3. August 2010, 21:14
  2. I want to run automatic crawling in nutch-1.0 on a timer.
    How can I use your recrawl script?
    Please give me a few hints.

    Posted by Aung Ko Win | 1. März 2011, 06:05

Post a Comment