Java works well for writing web crawlers. Below, we use crawling mit.edu as an example.
1. Parse the root page ("mit.edu") and collect all the links on it. To fetch and parse each URL's HTML we use JSoup, a convenient and simple Java library similar to Python's BeautifulSoup (a minimal JSoup sketch follows this list).
2. Take the URLs retrieved in step 1 and parse those pages in turn.
3. While doing the above, we need to keep track of which pages have already been processed, so that each page is handled only once. This is why we need a database.
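To make step 1 concrete, here is a minimal JSoup sketch that fetches a page and prints its absolute links. The class name JsoupDemo is just an illustration, and error handling is omitted:

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupDemo {
    public static void main(String[] args) throws IOException {
        // Fetch and parse the root page.
        Document doc = Jsoup.connect("http://www.mit.edu").get();
        // "a[href]" selects every anchor tag that has an href attribute;
        // "abs:href" resolves each link against the page's base URL.
        for (Element link : doc.select("a[href]")) {
            System.out.println(link.attr("abs:href"));
        }
    }
}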
Getting started with the crawler in Java
1. Download the JSoup core library from http://jsoup.org/download and the MySQL Connector/J jar from http://dev.mysql.com/downloads/connector/j/.
2. Create a project named Crawler in Eclipse and add the JSoup and MySQL Connector/J jars to the Java Build Path.
3. Create a class named DB, which handles the database operations:
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

public class DB {

    public Connection conn = null;

    public DB() {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            String url = "jdbc:mysql://localhost:3306/Crawler";
            conn = DriverManager.getConnection(url, "root", "admin213");
            System.out.println("conn built");
        } catch (SQLException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    // Run a query (SELECT) and return its result set.
    public ResultSet runSql(String sql) throws SQLException {
        Statement sta = conn.createStatement();
        return sta.executeQuery(sql);
    }

    // Run a statement that returns no rows (INSERT, TRUNCATE, ...).
    public boolean runSql2(String sql) throws SQLException {
        Statement sta = conn.createStatement();
        return sta.execute(sql);
    }

    @Override
    protected void finalize() throws Throwable {
        // Close the connection when the object is garbage-collected.
        if (conn != null && !conn.isClosed()) {
            conn.close();
        }
    }
}
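The DB class assumes a MySQL schema named Crawler containing a Record table. The table definition is not shown here, but judging from the INSERT statement used in the next step (and the RETURN_GENERATED_KEYS flag), a minimal sketch could look like this; the RecordID name and the column size are assumptions:

CREATE DATABASE Crawler;
CREATE TABLE Crawler.Record (
    RecordID INT AUTO_INCREMENT PRIMARY KEY,  -- assumed auto-increment key
    URL VARCHAR(1024) NOT NULL                -- the crawled page's URL
);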
4. Create a class named Main; this will be our crawler class:
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Main {

    public static DB db = new DB();

    public static void main(String[] args) throws SQLException, IOException {
        db.runSql2("TRUNCATE Record;");
        processPage("http://www.mit.edu");
    }

    public static void processPage(String URL) throws SQLException, IOException {
        // Check whether the given URL is already in the database.
        PreparedStatement check = db.conn.prepareStatement("SELECT 1 FROM Record WHERE URL = ?");
        check.setString(1, URL);
        ResultSet rs = check.executeQuery();
        if (!rs.next()) {
            // Store the URL in the database so it is not parsed again.
            String sql = "INSERT INTO `Crawler`.`Record` (`URL`) VALUES (?);";
            PreparedStatement stmt = db.conn.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS);
            stmt.setString(1, URL);
            stmt.execute();

            // Fetch and parse the page, then pull out useful information.
            Document doc = Jsoup.connect(URL).get();
            if (doc.text().contains("research")) {
                System.out.println(URL);
            }

            // Get all links and recursively call processPage on each one.
            Elements links = doc.select("a[href]");
            for (Element link : links) {
                if (link.attr("href").contains("mit.edu"))
                    processPage(link.attr("abs:href"));
            }
        }
    }
}
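A design note: processPage calls itself for every new link, so on a large site the recursion can get deep enough to throw a StackOverflowError. A common alternative, not part of the steps above, is an iterative breadth-first loop with an explicit queue. A sketch that could be dropped into the Main class (the method name crawl is hypothetical; it reuses the same db object and Record table):

// Iterative variant of processPage: the same check-insert-parse logic,
// driven by an explicit work queue instead of recursion.
public static void crawl(String seed) throws SQLException, IOException {
    java.util.Deque<String> queue = new java.util.ArrayDeque<>();
    queue.add(seed);
    while (!queue.isEmpty()) {
        String url = queue.poll();
        // Skip pages we have already stored.
        PreparedStatement check = db.conn.prepareStatement("SELECT 1 FROM Record WHERE URL = ?");
        check.setString(1, url);
        if (check.executeQuery().next()) continue;
        // Record the URL, then fetch and scan the page.
        PreparedStatement stmt = db.conn.prepareStatement("INSERT INTO `Crawler`.`Record` (`URL`) VALUES (?);");
        stmt.setString(1, url);
        stmt.execute();
        Document doc = Jsoup.connect(url).get();
        if (doc.text().contains("research")) {
            System.out.println(url);
        }
        // Enqueue in-domain links instead of recursing into them.
        for (Element link : doc.select("a[href]")) {
            if (link.attr("href").contains("mit.edu"))
                queue.add(link.attr("abs:href"));
        }
    }
}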