开发手册 欢迎您!
软件开发者资料库

.NET Core 使用 DotnetSpider 抓取页面教程

本文主要介绍如何通过DotnetSpider编写少量代码,快速地实现网页的抓取。

1、 通过Nuget安装引用DotnetSpider

项目上右键 -》选择"管理Nuget程序包" -》搜索"DotnetSpider" -》点击"DotnetSpider.Core"安装,另外还需要安装"DotnetSpider.Extension"

Nuget使用教程

2、数据存储EntityPipeline

可以使用框架提供的ConsoleEntityPipeline将抓取结果输出到控制台,框架还支持excel、mysql、mongodb等存储方式,这些实现类都在DotnetSpider.Extension.Pipeline命名空间下,可以在该命名空间下查看其它EntityPipeline的实现类。也可以继承EntityPipeline类,实现自己的存储逻辑,例如,

    public class StoragePipeline : EntityPipeline
    {
        protected override int Process(List items, dynamic sender = null)
        {
            if (items == null) return 0;
            DateTime dateTime;
            string dateTimeString = string.Empty;
            string path = "./web.txt";
            foreach (var data in items)
            {
                        lock (this)
                        {
                            if (!File.Exists(path))
                            {
                                File.Create(path);
                            }
                            var streamWriter = File.AppendText(path);
                            using (streamWriter)
                            {
                                streamWriter.Write(Newtonsoft.Json.JsonConvert.SerializeObject(data));
                                streamWriter.WriteLine();
                            }
                        }
                    }
                }
            }
            return items.Count;
        }
    }

3、爬虫类的实现

继承EntitySpider类来实现爬虫类,继承BaseEntity类来实现爬虫实体。实体属性要加上[Column]标签,通过[Field]标签编写XPath表达式来提取内容,通过[ReplaceFormatter]标签对提取到的内容进行字符串替换,最终的值会赋给对应的实体属性,例如,

  private class SpiderWeb : EntitySpider        {            protected override void OnInit(params string[] arguments)            {                var page = 1;                var listRequest = new List();               //循环添加要请求的url                for (int i = 1; i < 500; i++)                {                    page = i;                  listRequest.Add(new Request(string.Format("https://stackoverflow.com/questions/tagged/python?sort=frequent&page={0}&pagesize=15", page),                  new Dictionary { { "page", page } }));                }                AddRequests(listRequest);                AddEntityType();                //AddPipeline(new ConsoleEntityPipeline());               AddPipeline(new StoragePipeline());            }            [Schema("stackoverflow", "stackoverflow_search_entity_model")]            [Entity(Expression = "//div[@id='questions']/*", Type = SelectorType.XPath)]            class StackoverflowSearchEntry : BaseEntity            {                [Column]                [Field(Expression = "page", Type = SelectorType.Enviroment)]                public string Page { get; set; }                [Column]                [Field(Expression = ".//div[@class='summary']/h3/a")]                [ReplaceFormatter(NewValue = "", OldValue = "")]                [ReplaceFormatter(NewValue = "", OldValue = "")]                public string Title { get; set; }                [Column]                [Field(Expression = ".//div[@class='summary']/h3/a/@href")]                public string Url { get; set; }                [Column]                [Field(Expression = ".//div[@class='summary']/div[1]")]                public string description { get; set; }                //匹配到的完整的内容                [Column]                [Field(Expression = ".", Option = FieldOptions.InnerText)]                public string PlainText { get; set; }            }        }

4、DotnetSpider使用完整代码

using DotnetSpider.Downloader;
using DotnetSpider.Extension;
using DotnetSpider.Extension.Model;
using DotnetSpider.Extension.Pipeline;
using DotnetSpider.Extraction;
using DotnetSpider.Extraction.Model;
using DotnetSpider.Extraction.Model.Attribute;
using DotnetSpider.Extraction.Model.Formatter;
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.CompilerServices;

namespace SpiderContent
{
    /// <summary>
    /// Console entry point: runs the Stack Overflow spider and waits for a key press.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            Console.WriteLine("Hello World!");

            var spider = new SpiderWeb();
            // Interval between fetches, to avoid requesting pages too quickly.
            // NOTE(review): presumably milliseconds - confirm against the library docs.
            spider.SleepTime = 1000;
            spider.Run();

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }

        /// <summary>
        /// Spider that requests the "frequent" question list pages of the Stack
        /// Overflow python tag and extracts one <see cref="StackoverflowSearchEntry"/>
        /// per child element of the questions container.
        /// </summary>
        private class SpiderWeb : EntitySpider
        {
            /// <summary>First list page to request.</summary>
            private const int FirstPage = 1;

            /// <summary>Exclusive upper bound of the page loop (pages 1..499 are requested).</summary>
            private const int PageLimit = 500;

            /// <summary>
            /// Queues one request per list page, registers the entity type and
            /// attaches the storage pipeline.
            /// </summary>
            /// <param name="arguments">Not used by this spider.</param>
            protected override void OnInit(params string[] arguments)
            {
                // NOTE(review): the generic arguments below (List<Request>,
                // Dictionary<string, dynamic>, AddEntityType<StackoverflowSearchEntry>)
                // were missing in the published listing, most likely stripped as
                // HTML tags. They are restored from how the values are used here;
                // confirm against the installed DotnetSpider version.
                var listRequest = new List<Request>();

                // Add one request per list page. The page number travels with the
                // request under the "page" key, which the entity's Page field reads back.
                for (int page = FirstPage; page < PageLimit; page++)
                {
                    listRequest.Add(new Request(
                        string.Format("https://stackoverflow.com/questions/tagged/python?sort=frequent&page={0}&pagesize=15", page),
                        new Dictionary<string, dynamic> { { "page", page } }));
                }

                AddRequests(listRequest);
                AddEntityType<StackoverflowSearchEntry>();
                // AddPipeline(new ConsoleEntityPipeline());
                AddPipeline(new StoragePipeline());
            }

            /// <summary>
            /// One extracted entry: each child element of <c>div#questions</c> yields one entity.
            /// </summary>
            [Schema("stackoverflow", "stackoverflow_search_entity_model")]
            [Entity(Expression = "//div[@id='questions']/*", Type = SelectorType.XPath)]
            class StackoverflowSearchEntry : BaseEntity
            {
                /// <summary>Value stored under the "page" key of the originating request.</summary>
                // `Enviroment` is kept exactly as spelled in the original listing.
                [Column]
                [Field(Expression = "page", Type = SelectorType.Enviroment)]
                public string Page { get; set; }

                /// <summary>Content of the title link in the summary block.</summary>
                // NOTE(review): both OldValue strings are empty in the published
                // listing; presumably they were HTML tags that got stripped when
                // the article was rendered. As written they replace nothing useful -
                // restore the intended values or remove the two attributes.
                [Column]
                [Field(Expression = ".//div[@class='summary']/h3/a")]
                [ReplaceFormatter(NewValue = "", OldValue = "")]
                [ReplaceFormatter(NewValue = "", OldValue = "")]
                public string Title { get; set; }

                /// <summary>The href attribute of the title link.</summary>
                [Column]
                [Field(Expression = ".//div[@class='summary']/h3/a/@href")]
                public string Url { get; set; }

                /// <summary>First div inside the summary block.</summary>
                // The lower-case name is kept on purpose: renaming the property
                // would change the serialized/stored field name.
                [Column]
                [Field(Expression = ".//div[@class='summary']/div[1]")]
                public string description { get; set; }

                /// <summary>Inner text of the whole matched element.</summary>
                [Column]
                [Field(Expression = ".", Option = FieldOptions.InnerText)]
                public string PlainText { get; set; }
            }
        }
    }

    /// <summary>
    /// Entity pipeline that appends every extracted entity to a local text file,
    /// one JSON document per line.
    /// </summary>
    public class StoragePipeline : EntityPipeline
    {
        /// <summary>Output file, resolved relative to the process working directory.</summary>
        private const string OutputPath = "./web.txt";

        // Dedicated lock object: locking on `this` would let any outside code that
        // holds a reference to the pipeline contend for the same monitor.
        private readonly object _writeLock = new object();

        /// <summary>
        /// Serializes each item to JSON and appends it as one line to the output file.
        /// </summary>
        /// <param name="items">Entities to store; may be <c>null</c>.</param>
        /// <param name="sender">Not used by this pipeline.</param>
        /// <returns>
        /// The number of items written, or 0 when <paramref name="items"/> is
        /// <c>null</c> or empty (the file is not touched in that case).
        /// </returns>
        // NOTE(review): the generic argument of List was missing in the published
        // listing. IBaseEntity is assumed from DotnetSpider 3.x's
        // EntityPipeline.Process - confirm against the installed version.
        protected override int Process(List<IBaseEntity> items, dynamic sender = null)
        {
            if (items == null || items.Count == 0)
            {
                return 0;
            }

            lock (_writeLock)
            {
                // File.AppendText creates the file when it does not exist, so no
                // separate File.Create call is needed. The previous File.Create
                // call returned an open FileStream that was never disposed, which
                // could make the following append fail because the file was still in use.
                using (var writer = File.AppendText(OutputPath))
                {
                    foreach (var item in items)
                    {
                        writer.WriteLine(Newtonsoft.Json.JsonConvert.SerializeObject(item));
                    }
                }
            }

            return items.Count;
        }
    }
}