| |
| 基于Spindle的增强HTTP Spider |
| |
发布者: 发布时间:2008-04-03 |
|
|
构建于 lucene 之上的可用的 Java 开源 Spider 少之又少，spindle 长期没有更新且功能不够完善，故而自己参考其源代码重新编写了一个可扩展的 WebCrawler。本着开源共享、共同进步的想法发布于此，期冀得到大家的批评指正，有任何意见及建议均可 Email 联系我 (kaninebruno@hotmail.com)。
以下代码基于 lucene-2.3.1、htmlparser-1.6、je-analysis-1.5.3，以及自己修改过的 cpdetector-1.0.5；下载地址分别为：
htmlparser: http://sourceforge.net/project/showfiles.php?group_id=24399
je-analysis: http://www.jesoft.cn/je-analysis-1.5.3.jar
lucene 就不用说了，cpdetector-1.0.5 见附件。
spindle 的官方站点: http://www.bitmechanic.com/projects/spindle/
- package com.huizhi.kanine.util;
-
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileNotFoundException;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.io.UnsupportedEncodingException;
- import java.net.HttpURLConnection;
- import java.net.MalformedURLException;
- import java.net.SocketException;
- import java.net.SocketTimeoutException;
- import java.net.URL;
- import java.net.UnknownHostException;
- import java.nio.charset.Charset;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.HashSet;
-
- import jeasy.analysis.MMAnalyzer;
-
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.document.DateTools;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.index.CorruptIndexException;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.search.Hits;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.TermQuery;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.LockObtainFailedException;
- import org.apache.lucene.store.RAMDirectory;
- import org.htmlparser.Parser;
- import org.htmlparser.PrototypicalNodeFactory;
- import org.htmlparser.filters.AndFilter;
- import org.htmlparser.filters.HasAttributeFilter;
- import org.htmlparser.filters.NodeClassFilter;
- import org.htmlparser.tags.BaseHrefTag;
- import org.htmlparser.tags.FrameTag;
- import org.htmlparser.tags.LinkTag;
- import org.htmlparser.tags.MetaTag;
- import org.htmlparser.util.EncodingChangeException;
- import org.htmlparser.util.NodeIterator;
- import org.htmlparser.util.NodeList;
- import org.htmlparser.util.ParserException;
- import org.htmlparser.visitors.HtmlPage;
-
- import cpdetector.io.ASCIIDetector;
- import cpdetector.io.CodepageDetectorProxy;
- import cpdetector.io.JChardetFacade;
- import cpdetector.io.ParsingDetector;
- import cpdetector.io.UnicodeDetector;
-
-
-
-
-
-
-
- public class SiteCapturer implements Runnable{
-
-
- protected URL mSource;
-
-
- protected String mTarget;
-
-
-
-
-
- protected ArrayList mPages;
-
-
- protected HashSet mFinished;
-
- protected Parser mParser;
-
-
- protected final int TRANSFER_SIZE = 4096;
-
-
- protected static String lineSep = System.getProperty("line.separator");
-
-
- protected int mthreads;
-
- protected ArrayList threadList;
-
-
- protected IndexWriter FSDWriter;
-
-
- protected IndexWriter RAMWriter;
-
- protected IndexSearcher indexSearcher;
-
- protected RAMDirectory ramDirectory;
-
-
- protected Analyzer luceneAnalyzer;
-
-
- protected String charset;
-
-
- protected int count = 0;
-
-
- protected int mPort;
-
-
- protected String mHost;
-
-
- protected boolean mCheck;
-
-
- public static final Object indexLock = new Object();
-
- public SiteCapturer() {
- mSource = null;
- mTarget = null;
- mthreads = 2;
- mCheck = false;
- mPages = new ArrayList();
- mFinished = new HashSet();
- mParser = new Parser();
- PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
- factory.registerTag(new LocalLinkTag());
- factory.registerTag(new LocalFrameTag());
- factory.registerTag(new LocalBaseHrefTag());
- mParser.setNodeFactory(factory);
- }
-
- public String getSource() {
- return mSource.toString();
- }
-
- public void setSource(String source) {
- if (source.endsWith("/"))
- source = source.substring(0, source.length() - 1);
- try {
- mSource = new URL(source);
- } catch (MalformedURLException e) {
- System.err.println("Invalid URL : " + getSource());
- }
- }
-
- public String getTarget() {
- return (mTarget);
- }
-
- public void setTarget(String target) {
- mTarget = target;
- }
-
- public int getThreads() {
- return (mthreads);
- }
-
- public void setThreads(int threads) {
- mthreads = threads;
- }
-
- public boolean isMCheck() {
- return mCheck;
- }
-
- public void setMCheck(boolean check) {
- mCheck = check;
- }
-
-
-
-
-
-
- public void capture(){
-
- mPages.clear();
- mPages.add(getSource());
-
- int responseCode = 0;
- String contentType = "";
-
- try {
- HttpURLConnection uc = (HttpURLConnection) mSource.openConnection();
- responseCode = uc.getResponseCode();
- contentType = uc.getContentType();
- } catch (MalformedURLException mue) {
- System.err.println("Invalid URL : " + getSource());
- } catch (IOException ie) {
- if (ie instanceof UnknownHostException) {
- System.err.println("UnknowHost : " + getSource());
- } else if (ie instanceof SocketException) {
- System.err.println("Socket Error : " + ie.getMessage() + " "
- + getSource());
- } else
- ie.printStackTrace();
- }
-
- if (responseCode == HttpURLConnection.HTTP_OK
- && contentType.startsWith("text/html")) {
-
- mPort = mSource.getPort();
- mHost = mSource.getHost();
- charset = autoDetectCharset(mSource);
-
-
- File indexDir = new File(mTarget);
-
- boolean flag = true;
- if (!indexDir.exists()) {
-
- indexDir.mkdir();
- } else if (IndexReader.indexExists(mTarget)) {
-
- flag = false;
- File lockfile = new File(mTarget + File.separator + "write.lock");
- if (lockfile.exists())
- lockfile.delete();
- }
- luceneAnalyzer = new MMAnalyzer();
- ramDirectory = new RAMDirectory();
-
- try {
- FSDWriter = new IndexWriter(indexDir, luceneAnalyzer, flag);
- RAMWriter = new IndexWriter(ramDirectory, luceneAnalyzer, true);
-
- while (mCheck) {
- IndexReader indexReader = IndexReader.open(mTarget);
- indexSearcher = new IndexSearcher(indexReader);
- }
-
- long start = System.currentTimeMillis();
- threadList = new ArrayList();
-
- for (int i = 0; i < mthreads; i++) {
- Thread t = new Thread(this, "K-9 Spider Thread #" + (i + 1));
- t.start();
- threadList.add(t);
- }
- while (threadList.size() > 0) {
- Thread child = (Thread) threadList.remove(0);
- try {
- child.join();
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- }
- long elapsed = System.currentTimeMillis() - start;
-
- RAMWriter.close();
- FSDWriter.addIndexes(new Directory[] { ramDirectory });
- FSDWriter.optimize();
- FSDWriter.close();
-
- System.out.println("Finished in " + (elapsed / 1000)
- + " seconds");
- System.out.println("The Count of the Links Captured is "
- + count);
- } catch (CorruptIndexException cie) {
- cie.printStackTrace();
- } catch (LockObtainFailedException lofe) {
- lofe.printStackTrace();
- } catch (IOException ie) {
- ie.printStackTrace();
- }
- }
- }
-
- public void run() {
- String url;
- while ((url = dequeueURL()) != null) {
- if (isToBeCaptured(url))
- process(url);
- }
- mthreads--;
- }
-
-
-
-
- public boolean isToBeCaptured (String url){
- boolean flag = false;
-
- HttpURLConnection uc = null;
- int responseCode = 0;
- String contentType = "";
- String host = "";
- int port = 0;
-
- try {
- URL source = new URL(url);
- String protocol = source.getProtocol();
- if (protocol != null && protocol.equals("http")) {
- host = source.getHost();
- port = source.getPort();
- uc = (HttpURLConnection) source.openConnection();
- uc.setConnectTimeout(8000);
- responseCode = uc.getResponseCode();
- contentType = uc.getContentType();
- }
- } catch (MalformedURLException mue) {
- System.err.println("Invalid URL : " + url);
- } catch (IOException ie) {
- if (ie instanceof UnknownHostException) {
- System.err.println("UnknowHost : " + url);
- } else if (ie instanceof SocketException) {
- System.err.println("Socket Error : " + ie.getMessage() + " "
- + url);
- } else if (ie instanceof SocketTimeoutException) {
- System.err.println("Socket Connection Time Out : " + url);
- } else if (ie instanceof FileNotFoundException) {
- System.err.println("broken link "
- + ((FileNotFoundException) ie.getCause()).getMessage()
- + " ignored");
- } else
- ie.printStackTrace();
- }
-
- if (port == mPort
- && responseCode == HttpURLConnection.HTTP_OK
- && host.equals(mHost)
- && (contentType.startsWith("text/html") || contentType
- .startsWith("text/plain")))
- flag = true;
- return flag;
- }
-
-
- public synchronized String dequeueURL() {
- while (true) {
- if (mPages.size() > 0) {
- String url = (String) mPages.remove(0);
- mFinished.add(url);
-
- if (isToBeCaptured(url)) {
- int bookmark;
- NodeList list;
- NodeList robots;
- MetaTag robot;
- String content;
- try {
- bookmark = mPages.size();
-
- mParser.setURL(url);
- try {
- list = new NodeList();
- for (NodeIterator e = mParser.elements(); e
- .hasMoreNodes();)
- list.add(e.nextNode());
- } catch (EncodingChangeException ece) {
-
- mParser.reset();
- list = new NodeList();
- for (NodeIterator e = mParser.elements(); e
- .hasMoreNodes();)
- list.add(e.nextNode());
- }
-
-
-
-
- robots = list
- .extractAllNodesThatMatch(
- new AndFilter(new NodeClassFilter(
- MetaTag.class),
- new HasAttributeFilter("name",
- "robots")), true);
- if (0 != robots.size()) {
- robot = (MetaTag) robots.elementAt(0);
- content = robot.getAttribute("content")
- .toLowerCase();
- if ((-1 != content.indexOf("none"))
- || (-1 != content.indexOf("nofollow")))
- for (int i = bookmark; i < mPages.size(); i++)
- mPages.remove(i);
- }
- } catch (ParserException pe) {
- pe.printStackTrace();
- }
- }
- return url;
- } else {
- mthreads--;
- if (mthreads > 0) {
- try {
- wait();
- mthreads++;
- } catch (InterruptedException ie) {
- ie.printStackTrace();
- }
- } else {
- notifyAll();
- return null;
- }
- }
- }
- }
-
-
-
-
- protected void process(String url) {
-
- String result[];
- String content = null;
- String title = null;
-
-
- if (mCheck) {
- try {
- TermQuery query = new TermQuery(new Term("url", url));
- Hits hits = indexSearcher.search(query);
- if (hits.length() > 0) {
- System.out.println("The URL : " + url
- + " has already been captured");
- } else {
- result = parseHtml(url, charset);
- content = result[0];
- title = result[1];
- }
- } catch (IOException ie) {
- ie.printStackTrace();
- }
- } else {
- result = parseHtml(url, charset);
- content = result[0];
- title = result[1];
- }
-
- if (content != null && content.trim().length() > 0) {
-
- Document document = new Document();
- document.add(new Field("content", content, Field.Store.YES,
- Field.Index.TOKENIZED,
- Field.TermVector.WITH_POSITIONS_OFFSETS));
- document.add(new Field("url", url, Field.Store.YES,
- Field.Index.UN_TOKENIZED));
- document.add(new Field("title", title, Field.Store.YES,
- Field.Index.TOKENIZED,
- Field.TermVector.WITH_POSITIONS_OFFSETS));
- document.add(new Field("date", DateTools.timeToString(new Date()
- .getTime(), DateTools.Resolution.DAY), Field.Store.YES,
- Field.Index.UN_TOKENIZED));
-
- synchronized (indexLock) {
- try {
- RAMWriter.addDocument(document);
-
-
-
-
- if (RAMWriter.ramSizeInBytes() > 512 * 1024) {
- RAMWriter.close();
- FSDWriter.addIndexes(new Directory[] { ramDirectory });
- RAMWriter = new IndexWriter(ramDirectory,
- luceneAnalyzer, true);
- }
- count++;
- System.out.println(Thread.currentThread().getName()
- + ": Finished Indexing URL: " + url);
- } catch (CorruptIndexException cie) {
- cie.printStackTrace();
- } catch (IOException ie) {
- ie.printStackTrace();
- }
- }
- }
- }
-
-
-
-
-
- class LocalLinkTag extends LinkTag {
- public void doSemanticAction() {
-
- String link = getLink();
- if (link.endsWith("/"))
- link = link.substring(0, link.length() - 1);
- int pos = link.indexOf("#");
- if (pos != -1)
- link = link.substring(0, pos);
-
-
- if (!(mFinished.contains(link) || mPages.contains(link)))
- mPages.add(link);
-
- setLink(link);
- }
- }
-
-
-
-
-
- class LocalFrameTag extends FrameTag {
- public void doSemanticAction() {
-
- String link = getFrameLocation();
- if (link.endsWith("/"))
- link = link.substring(0, link.length() - 1);
- int pos = link.indexOf("#");
- if (pos != -1)
- link = link.substring(0, pos);
-
-
- if (!(mFinished.contains(link) || mPages.contains(link)))
- mPages.add(link);
-
- setFrameLocation(link);
- }
- }
-
-
-
-
-
- class LocalBaseHrefTag extends BaseHrefTag {
-
- public String toHtml() {
- return ("");
- }
- }
-
-
- protected String autoDetectCharset(URL url) {
-
- CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
-
-
-
-
-
- detector.add(new ParsingDetector(false));
- detector.add(JChardetFacade.getInstance());
- detector.add(ASCIIDetector.getInstance());
- detector.add(UnicodeDetector.getInstance());
-
- Charset charset = null;
- try {
- charset = detector.detectCodepage(url);
- } catch (MalformedURLException mue) {
- mue.printStackTrace();
- } catch (IOException ie) {
- ie.printStackTrace();
- }
- if (charset == null)
- charset = Charset.defaultCharset();
- return charset.name();
- }
-
-
- protected String[] parseHtml(String url, String charset) {
-
- String result[] = null;
- String content = null;
-
- try {
- URL source = new URL(url);
- InputStream in = source.openStream();
- BufferedReader reader = new BufferedReader(new InputStreamReader(
- in, charset));
- String line = new String();
- StringBuffer temp = new StringBuffer(TRANSFER_SIZE);
- while ((line = reader.readLine()) != null) {
- temp.append(line);
- temp.append(lineSep);
- }
- reader.close();
- in.close();
- content = temp.toString();
- } catch (MalformedURLException mue) {
- System.err.println("Invalid URL : " + url);
- } catch (UnsupportedEncodingException uee) {
- uee.printStackTrace();
- } catch (IOException ie) {
- if (ie instanceof UnknownHostException) {
- System.err.println("UnknowHost : " + url);
- } else if (ie instanceof SocketException) {
- System.err.println("Socket Error : " + ie.getMessage() + " "
- + url);
- } else if (ie instanceof SocketTimeoutException) {
- System.err.println("Socket Connection Time Out : " + url);
- } else
- ie.printStackTrace();
- }
-
- if (content != null) {
- Parser myParser = Parser.createParser(content, charset);
- HtmlPage visitor = new HtmlPage(myParser);
- try {
- myParser.visitAllNodesWith(visitor);
- String body = null;
- String title = "Untitled";
- if (visitor.getBody() != null) {
- NodeList nodelist = visitor.getBody();
- body = nodelist.asString().trim();
- }
- if (visitor.getTitle() != null)
- title = visitor.getTitle();
- result = new String[] { body, title };
- } catch (ParserException pe) {
- pe.printStackTrace();
- }
- }
- return result;
- }
-
- public static void main(String[] args) {
- SiteCapturer worker = new SiteCapturer();
-
- if (args.length < 6)
- throw new IllegalArgumentException(
- "Usage:java -u [start url] -d [index dir] -t [threads] [-c]");
-
- for (int i = 0; i < args.length; i++) {
- if (args[i].equals("-u"))
- worker.setSource(args[++i]);
- else if (args[i].equals("-d"))
- worker.setTarget(args[++i]);
- else if (args[i].equals("-t"))
- worker.setThreads(Integer.parseInt(args[++i]));
- else if (args[i].equals("-c"))
- worker.setMCheck(true);
- }
-
- if (worker.getThreads() < 1)
- throw new IllegalArgumentException("Invalid number of threads: "
- + worker.getThreads());
-
- worker.capture();
- System.exit(0);
- }
- }
package com.huizhi.kanine.util;
import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.SocketException; import java.net.SocketTimeoutException; import java.net.URL; import java.net.UnknownHostException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Date; import java.util.HashSet;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.store.RAMDirectory; import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.filters.AndFilter; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.tags.BaseHrefTag; import org.htmlparser.tags.FrameTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.MetaTag; import org.htmlparser.util.EncodingChangeException; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.visitors.HtmlPage;
import cpdetector.io.ASCIIDetector; import cpdetector.io.CodepageDetectorProxy; import cpdetector.io.JChardetFacade; import cpdetector.io.ParsingDetector; import cpdetector.io.UnicodeDetector;
/** * @author 张波 * E-mail:kaninebruno@hotmail.com * Created On : 2008-03-30 */ public class SiteCapturer implements Runnable{ /* 基准(初始)URL */ protected URL mSource;
/* 索引文件的存放位置 */ protected String mTarget;
/** * 待解析的URL地址集合,所有新检测到的链接均存放于此; * 解析时按照先入先出(First-In First-Out)法则线性取出 */ protected ArrayList mPages;
/* 已解析的URL地址集合,避免链接的重复抓取 */ protected HashSet mFinished;
protected Parser mParser; /* StringBuffer的缓冲区大小 */ protected final int TRANSFER_SIZE = 4096; /* 当前平台的行分隔符 */ protected static String lineSep = System.getProperty("line.separator"); /* 程序运行线程数,默认2个线程 */ protected int mthreads; protected ArrayList threadList; /* 存储于磁盘的IndexWriter */ protected IndexWriter FSDWriter; /* 存储于内存的IndexWriter */ protected IndexWriter RAMWriter;
protected IndexSearcher indexSearcher;
protected RAMDirectory ramDirectory; /* 筛选页面内容的分词器 */ protected Analyzer luceneAnalyzer;
/* 解析页面时的字符编码 */ protected String charset; /* 统计已抓取的页面数量 */ protected int count = 0; /* 基准端口 */ protected int mPort; /* 基准主机 */ protected String mHost; /* 检测索引中是否存在当前URL信息,避免重复抓取 */ protected boolean mCheck;
/* 索引操作的写入线程锁 */ public static final Object indexLock = new Object(); public SiteCapturer() { mSource = null; mTarget = null; mthreads = 2; mCheck = false; mPages = new ArrayList(); mFinished = new HashSet(); mParser = new Parser(); PrototypicalNodeFactory factory = new PrototypicalNodeFactory(); factory.registerTag(new LocalLinkTag()); factory.registerTag(new LocalFrameTag()); factory.registerTag(new LocalBaseHrefTag()); mParser.setNodeFactory(factory); }
public String getSource() { return mSource.toString(); }
public void setSource(String source) { if (source.endsWith("/")) source = source.substring(0, source.length() - 1); try { mSource = new URL(source); } catch (MalformedURLException e) { System.err.println("Invalid URL : " + getSource()); } }
public String getTarget() { return (mTarget); }
public void setTarget(String target) { mTarget = target; } public int getThreads() { return (mthreads); }
public void setThreads(int threads) { mthreads = threads; } public boolean isMCheck() { return mCheck; }
public void setMCheck(boolean check) { mCheck = check; }
/** * 程序入口,在此初始化mPages、IndexWriter * 通过协调各线程间的活动完成website的抓取工作 * 任务完成后将所有的索引片段合并为一个以优化检索 */ public void capture(){
mPages.clear(); mPages.add(getSource()); int responseCode = 0; String contentType = ""; try { HttpURLConnection uc = (HttpURLConnection) mSource.openConnection(); responseCode = uc.getResponseCode(); contentType = uc.getContentType(); } catch (MalformedURLException mue) { System.err.println("Invalid URL : " + getSource()); } catch (IOException ie) { if (ie instanceof UnknownHostException) { System.err.println("UnknowHost : " + getSource()); } else if (ie instanceof SocketException) { System.err.println("Socket Error : " + ie.getMessage() + " " + getSource()); } else ie.printStackTrace(); } if (responseCode == HttpURLConnection.HTTP_OK && contentType.startsWith("text/html")) { mPort = mSource.getPort(); mHost = mSource.getHost(); charset = autoDetectCharset(mSource);
/* 存放索引文件的位置 */ File indexDir = new File(mTarget); /* 标记是否重新建立索引,true为重新建立索引 */ boolean flag = true; if (!indexDir.exists()) { /* 如果文件夹不存在则创建 */ indexDir.mkdir(); } else if (IndexReader.indexExists(mTarget)) { /* 如果已存在索引,则追加索引 */ flag = false; File lockfile = new File(mTarget + File.separator + "write.lock"); if (lockfile.exists()) lockfile.delete(); } luceneAnalyzer = new MMAnalyzer(); ramDirectory = new RAMDirectory();
try { FSDWriter = new IndexWriter(indexDir, luceneAnalyzer, flag); RAMWriter = new IndexWriter(ramDirectory, luceneAnalyzer, true); while (mCheck) { IndexReader indexReader = IndexReader.open(mTarget); indexSearcher = new IndexSearcher(indexReader); } long start = System.currentTimeMillis(); threadList = new ArrayList();
for (int i = 0; i < mthreads; i++) { Thread t = new Thread(this, "K-9 Spider Thread #" + (i + 1)); t.start(); threadList.add(t); } while (threadList.size() > 0) { Thread child = (Thread) threadList.remove(0); try { child.join(); } catch (InterruptedException e) { e.printStackTrace(); } } long elapsed = System.currentTimeMillis() - start;
RAMWriter.close(); FSDWriter.addIndexes(new Directory[] { ramDirectory }); FSDWriter.optimize(); FSDWriter.close();
System.out.println("Finished in " + (elapsed / 1000) + " seconds"); System.out.println("The Count of the Links Captured is " + count); } catch (CorruptIndexException cie) { cie.printStackTrace(); } catch (LockObtainFailedException lofe) { lofe.printStackTrace(); } catch (IOException ie) { ie.printStackTrace(); } } } public void run() { String url; while ((url = dequeueURL()) != null) { if (isToBeCaptured(url)) process(url); } mthreads--; }
/** * 判断提取到的链接是否符合解析条件;标准为Port及Host与基准URL相同且类型为text/html或text/plain */ public boolean isToBeCaptured (String url){ boolean flag = false; HttpURLConnection uc = null; int responseCode = 0; String contentType = ""; String host = ""; int port = 0; try { URL source = new URL(url); String protocol = source.getProtocol(); if (protocol != null && protocol.equals("http")) { host = source.getHost(); port = source.getPort(); uc = (HttpURLConnection) source.openConnection(); uc.setConnectTimeout(8000); responseCode = uc.getResponseCode(); contentType = uc.getContentType(); } } catch (MalformedURLException mue) { System.err.println("Invalid URL : " + url); } catch (IOException ie) { if (ie instanceof UnknownHostException) { System.err.println("UnknowHost : " + url); } else if (ie instanceof SocketException) { System.err.println("Socket Error : " + ie.getMessage() + " " + url); } else if (ie instanceof SocketTimeoutException) { System.err.println("Socket Connection Time Out : " + url); } else if (ie instanceof FileNotFoundException) { System.err.println("broken link " + ((FileNotFoundException) ie.getCause()).getMessage() + " ignored"); } else ie.printStackTrace(); } if (port == mPort && responseCode == HttpURLConnection.HTTP_OK && host.equals(mHost) && (contentType.startsWith("text/html") || contentType .startsWith("text/plain"))) flag = true; return flag; }
/* 从URL队列mPages里取出单个的URL */ public synchronized String dequeueURL() { while (true) { if (mPages.size() > 0) { String url = (String) mPages.remove(0); mFinished.add(url); if (isToBeCaptured(url)) { int bookmark; NodeList list; NodeList robots; MetaTag robot; String content; try { bookmark = mPages.size(); /* 获取页面所有节点 */ mParser.setURL(url); try { list = new NodeList(); for (NodeIterator e = mParser.elements(); e .hasMoreNodes();) list.add(e.nextNode()); } catch (EncodingChangeException ece) { /* 解码出错的异常处理 */ mParser.reset(); list = new NodeList(); for (NodeIterator e = mParser.elements(); e .hasMoreNodes();) list.add(e.nextNode()); } /** * 依据 http://www.robotstxt.org/wc/meta-user.html 处理 * Robots tag */ robots = list .extractAllNodesThatMatch( new AndFilter(new NodeClassFilter( MetaTag.class), new HasAttributeFilter("name", "robots")), true); if (0 != robots.size()) { robot = (MetaTag) robots.elementAt(0); content = robot.getAttribute("content") .toLowerCase(); if ((-1 != content.indexOf("none")) || (-1 != content.indexOf("nofollow"))) for (int i = bookmark; i < mPages.size(); i++) mPages.remove(i); } } catch (ParserException pe) { pe.printStackTrace(); } } return url; } else { mthreads--; if (mthreads > 0) { try { wait(); mthreads++; } catch (InterruptedException ie) { ie.printStackTrace(); } } else { notifyAll(); return null; } } } }
/** * 处理单独的URL地址,解析页面并加入到lucene索引中;通过自动探测页面编码保证抓取工作的顺利执行 */ protected void process(String url) { String result[]; String content = null; String title = null;
/* 此项操作较耗性能,故默认不予检测 */ if (mCheck) { try { TermQuery query = new TermQuery(new Term("url", url)); Hits hits = indexSearcher.search(query); if (hits.length() > 0) { System.out.println("The URL : " + url + " has already been captured"); } else { result = parseHtml(url, charset); content = result[0]; title = result[1]; } } catch (IOException ie) { ie.printStackTrace(); } } else { result = parseHtml(url, charset); content = result[0]; title = result[1]; } if (content != null && content.trim().length() > 0) {
Document document = new Document(); document.add(new Field("content", content, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); document.add(new Field("url", url, Field.Store.YES, Field.Index.UN_TOKENIZED)); document.add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); | |