java吧 关注:1,238,955贴子:12,709,239
  • 10回复贴,共1

【转】 一个YD的mini爬虫 有马有真相

只看楼主收藏回复

原帖地址:topic.csdn.net/u/20110915/21/c2047b50-3fa2-4cfa-b91d-1237ccdbb80d.html
据说一分钟下载71张风**PP


IP属地:四川1楼2012-09-03 13:20回复
    public class MiniCrawler {
    // URLs that crawl() has already processed; each URL is added after its parse attempt.
    private Set<String> alreadyCrawledSet;
    // URLs waiting to be crawled; crawl() removes them from the head one at a time.
    private Queue<String> unCrawlQueue;
    // Picture addresses that crawl() writes to picAddrs.txt.
    // Static, yet reassigned by the constructor, so it is shared and reset across instances.
    private static Set<String> picSrcSet;
    // crawl() stops taking URLs from the queue once alreadyCrawledSet reaches this size.
    private static final int MAX_COUNT = 10;
    // Path of the output folder built in the constructor (ends with a path separator).
    private String newFolder;
    /**
     * Creates a crawler with empty crawl-state collections and prepares an
     * output folder under C:\ whose name is built from the current month,
     * day, hour, minute and second (for example C:\9_3_13_20_5\).
     *
     * Note: picSrcSet is a static field, so constructing a new crawler
     * replaces the set seen by every existing instance.
     */
    public MiniCrawler()
    {
        // Parameterized instead of raw types, so no unchecked-conversion warnings.
        this.alreadyCrawledSet = new HashSet<String>();
        this.unCrawlQueue = new ArrayDeque<String>();
        // Static field: assign through the class name rather than through "this".
        MiniCrawler.picSrcSet = new HashSet<String>();

        Calendar cal = Calendar.getInstance();
        // Calendar.MONTH is zero-based, hence the +1.
        int month = cal.get(Calendar.MONTH)+1;
        int day = cal.get(Calendar.DAY_OF_MONTH);
        int hour = cal.get(Calendar.HOUR_OF_DAY);
        int minute = cal.get(Calendar.MINUTE);
        int second = cal.get(Calendar.SECOND);
        String folder = month+"_"+day+"_"+hour+"_"+minute+"_"+second;
        newFolder = "C:\\"+folder+"\\";

        File dir = new File(newFolder);
        // "new File(...)" can never be null, so only existence needs checking.
        // mkdir() reports failure through its return value; surface it instead
        // of silently continuing with a folder that does not exist.
        if (!dir.exists() && !dir.mkdir())
        {
            System.err.println("Failed to create output folder: " + newFolder);
        }
    }
    /**
     * Parses the page at beginUrl, then keeps taking URLs from unCrawlQueue
     * until the queue is empty or MAX_COUNT pages have been recorded in
     * alreadyCrawledSet. Afterwards every entry of picSrcSet is written, one
     * per line, to picAddrs.txt inside the output folder.
     *
     * Parse failures are printed and the affected URL is still recorded as
     * crawled, so a bad page never stops the crawl.
     *
     * @param beginUrl the URL of the first page to parse
     */
    public void crawl(String beginUrl)
    {
        Parser parser = null;
        try {
            parser = new Parser(beginUrl);
            parser.setEncoding(parser.getEncoding());
            parseNodeList(parser.parse(null));
        }
        catch(ParserException e)
        {
            e.printStackTrace();
        }
        alreadyCrawledSet.add(beginUrl);

        // Stop once everything has been crawled or the maximum page count is reached.
        while(!unCrawlQueue.isEmpty() && alreadyCrawledSet.size()< MAX_COUNT)
        {
            String newUrl = unCrawlQueue.remove();
            try{
                if(null == parser)
                {
                    // The try block above failed before a Parser was assigned;
                    // build one for this URL instead of dereferencing null.
                    parser = new Parser(newUrl);
                }
                else
                {
                    parser.setResource(newUrl);
                }
                parser.setEncoding(parser.getEncoding());
                parseNodeList(parser.parse(null));
            } catch (ParserException e) {
                e.printStackTrace();
            }
            alreadyCrawledSet.add(newUrl);
        }
        System.out.println("Crawl Finish!");

        // Write all collected picture addresses to a file.
        PrintWriter pw = null;
        try {
            String picFile = newFolder+"picAddrs.txt";
            pw = new PrintWriter(new FileOutputStream(new File(picFile)));
            for(String picSrc : picSrcSet)
            {
                // One address per line; without a separator the addresses
                // would run together and be unusable.
                pw.println(picSrc);
            }
            // PrintWriter never throws on write errors; checkError() flushes
            // the stream and reports whether any write failed.
            if(pw.checkError())
            {
                System.err.println("Failed to write pic addresses to " + picFile);
            }
            else
            {
                System.out.println("====== All the pic address have been writen to the file!");
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }finally
        {
            if(null!=pw)
            {
                pw.close();
            }
        }
    }
    public void parseNodeList(NodeList nodeList)
    {
    NodeIterator iter = nodeList.elements();
    try {
    while(iter.hasMoreNodes())
    {
    Node node = iter.nextNode();
    if(node.getClass() == LinkTag.class)
    {
    LinkTag tag = (LinkTag)node;
    String href = tag.getLink();
    //System.out.println("===find a link: "+href);
    if(null != href)
    {
    if(!alreadyCrawledSet.contains(href) && (href.indexOf("www.169pp.com")!=-1) && href.endsWith("htm"))
    


    IP属地:四川2楼2012-09-03 13:20
    回复
      好的支持下……
      话说你怎么全是转贴……


      IP属地:江苏4楼2012-09-03 13:21
      收起回复
        这个。。。


        5楼2012-09-03 13:33
        回复
          又多YD


          IP属地:北京6楼2012-09-03 13:53
          回复
            马克


            IP属地:河南来自手机贴吧7楼2012-09-04 01:39
            回复
              马克


              IP属地:浙江来自手机贴吧8楼2012-09-04 08:21
              回复
                看来你很喜欢搜索.


                9楼2012-09-04 09:42
                回复