# 用 ts 编写小爬虫

# 初始化项目

创建 package.json 和 tsconfig.json，便捷的命令是npm init -y和tsc --init，当然你也可以手动创建手动添加配置项。

# 使用 superagent

superagent是一个请求代理模块，可以使用它在 node 环境中发送 ajax 请求。

先安装它，npm install superagent --save，然后再安装它的.d.ts类型声明文件，npm install @types/superagent --D。

我们主要使用它的 superagent.get 方法，会去请求链接，然后返回我们要爬取的内容。

private async getSourHtml(): Promise<string> {
    const response: superagent.Response = await superagent.get(this._url);
    return response.text;
}

1
2
3
4

# 使用 cheerio

我们需要对爬取到的 html 内容进行分析提取，可以通过cheerio来获取 html 内容里的模块组成。

private getRankInfo(html: string): IrankInfo {
    // 将html字符串用cheerio转为dom元素
    this._$ = cheerio.load(html);
    // 文章信息在id为allupdate的ul里
    const rankHtml: cheerio.Cheerio = this._$('.rank_main');
    // console.log('rankHtml', rankHtml.html());
    // 最后要返回的值
    const rankInfo: IrankInfo = { name: '', list: new Array<Irank>() };
    // 标题
    const nameHtml = this._$(rankHtml).find('.rank_banner .rank_i_title_name')?.children();
    rankInfo.name = nameHtml ? nameHtml.text() : '人气排行榜';
    // 榜单清单
    const listHtml = this._$(rankHtml).find('.rank_i_lists .rank_i_p_list');
    listHtml.map((index, value) => {
        if ([0, 2, 4, 5].includes(index)) { // 只收取有点击数的，其他的暂时不作处理
            const name: string = unescapeCode(this._$(value).find('.rank_i_p_tit').text());
            const list: Ibook[] = this.handleBookInfo(this._$(value).find('.rank_i_li'));
            rankInfo.list.push({ name, list });
        }
    });
    return rankInfo;
}
private handleBookInfo(obj: cheerio.Cheerio): Ibook[] {
    const bookArr: Ibook[] = [];
    obj.map((index, value) => {
        let num = '';
        let bookName = '';
        let bookCount = '';
        if (index === 0) {
            const data = this._$(value).children();
            num = unescapeCode(data.eq(0).text());
            bookName = unescapeCode(data.eq(1).children().eq(0).text());
            bookCount = unescapeCode(data.eq(1).children().eq(3).text());
            bookCount = bookCount.replace(/[^0-9]/ig, ''); // 将非数字去掉
        } else {
            const data = this._$(value).children();
            num = unescapeCode(data.eq(0).text());
            bookName = unescapeCode(data.eq(1).children().eq(0).text());
            bookCount = unescapeCode(data.eq(2).text());
        }
        bookArr.push({ num, bookName, bookCount });
    });
    return bookArr;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44

# 使用 node 自带的 fs 处理文件

首先是读取旧文件里的东西：fs.readFileSync(filePath, 'utf-8')，并将它转成 json 对象，再将新内容添加到 json 对象里，最后又将 json 对象转为 json 字符串。后面由于更改了需求，没有必要追加到以前的数据上，直接使用新数据。

private generateJsonData(rankInfo: IrankInfo, filePath: string): string {
    let fileContent: IrankInfo = null;
    /* if (fs.existsSync(filePath)) {
        // 旧数据：从旧文件中读取的是json字符串，转成json
        fileContent = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
    } */
    // 构造新数据，返回记得转成json字符串。其实旧数据被覆盖了，其实也不需要旧数据
    fileContent = rankInfo;

    return JSON.stringify(fileContent);
}

1
2
3
4
5
6
7
8
9
10
11

有了 json 字符串后，需要重新将它写入文件

private writeFile(jsonData: string) {
    fs.writeFileSync(this._filePath, jsonData);
}

1
2
3

# 模式优化

使用组合模式将分析 html 那块单独抽离，并且抽象出一个顶级的“分析器”，我们需要什么样的分析器再实现什么样的分析器。

使用单例模式来用分析器，在全局中只要有分析器的一个实例就可以了，只需调用单例的公共方法。

# 爬虫代码清单

crawler.ts

import fs from "fs";
import path from "path";
import superagent from "superagent";

export interface IAnalyzer {
  analysis: (html: string, filePath: string) => string;
}

/**
 * @description 使用ts写一个爬虫(优化crawler1.ts)
 * @author LiawnLiu
 * @version 1.0.0
 */
export default class Crawler {
  private _filePath: string = path.resolve(__dirname, "../../data/book.json");
  constructor(private _url: string, private _analyzer: IAnalyzer) {
    // 开始爬取
    this.startCrawler();
  }
  /**
   * @description 总控制
   */
  private async startCrawler() {
    // 获取网站静态html内容
    const html: string = await this.getSourHtml();
    // 分析器分析
    const jsonData: string = this._analyzer.analysis(html, this._filePath);
    // 将json数据写入文件
    this.writeFile(jsonData);
  }
  /**
   * @description 获取网站静态html内容
   * @returns html内容，隐式转换为Promise了
   */
  private async getSourHtml(): Promise<string> {
    const response: superagent.Response = await superagent.get(this._url);
    return response.text;
  }
  /**
   * @description 将json数据写入文件
   */
  private writeFile(jsonData: string) {
    fs.writeFileSync(this._filePath, jsonData);
  }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45

bookAnalyzer.ts

import cheerio from "cheerio";
import { IAnalyzer } from "./crawler";
import { unescapeCode } from "../utils/escapeCode";

// 汇总信息
interface IrankInfo {
  name: string; // 名字or标题
  list: Irank[]; // 具体内容
}
interface Irank {
  name: string; // 名字or标题
  list: Ibook[]; // 具体内容
}
// 小说信息
interface Ibook {
  num: string;
  bookName: string;
  bookCount: string;
}

export default class BookAnalyzer implements IAnalyzer {
  // 单例模式
  private static _instance: BookAnalyzer;
  public static get I(): BookAnalyzer {
    return this._instance ? this._instance : (this._instance = new BookAnalyzer());
  }
  private _$: cheerio.Root = null;
  private constructor() {}
  public analysis(html: string, filePath: string): string {
    // 对html内容进行处理
    const rankInfo: IrankInfo = this.getRankInfo(html);
    // 将处理后的信息转为json数据
    return this.generateJsonData(rankInfo, filePath);
  }
  /**
   * @description 对html内容进行处理
   * @returns html内容，隐式转换为Promise了
   */
  private getRankInfo(html: string): IrankInfo {
    // 将html字符串用cheerio转为dom元素
    this._$ = cheerio.load(html);
    // 文章信息在id为allupdate的ul里
    const rankHtml: cheerio.Cheerio = this._$(".rank_main");
    // console.log('rankHtml', rankHtml.html());
    // 最后要返回的值
    const rankInfo: IrankInfo = { name: "", list: new Array<Irank>() };
    // 标题
    const nameHtml = this._$(rankHtml).find(".rank_banner .rank_i_title_name")?.children();
    rankInfo.name = nameHtml ? nameHtml.text() : "人气排行榜";
    // 榜单清单
    const listHtml = this._$(rankHtml).find(".rank_i_lists .rank_i_p_list");
    listHtml.map((index, value) => {
      if ([0, 2, 4, 5].includes(index)) {
        // 只收取有点击数的，其他的暂时不作处理
        const name: string = unescapeCode(this._$(value).find(".rank_i_p_tit").text());
        const list: Ibook[] = this.handleBookInfo(this._$(value).find(".rank_i_li"));
        rankInfo.list.push({ name, list });
      }
    });
    return rankInfo;
  }
  private handleBookInfo(obj: cheerio.Cheerio): Ibook[] {
    const bookArr: Ibook[] = [];
    obj.map((index, value) => {
      let num = "";
      let bookName = "";
      let bookCount = "";
      if (index === 0) {
        const data = this._$(value).children();
        num = unescapeCode(data.eq(0).text());
        bookName = unescapeCode(data.eq(1).children().eq(0).text());
        bookCount = unescapeCode(data.eq(1).children().eq(3).text());
        bookCount = bookCount.replace(/[^0-9]/gi, ""); // 将非数字去掉
      } else {
        const data = this._$(value).children();
        num = unescapeCode(data.eq(0).text());
        bookName = unescapeCode(data.eq(1).children().eq(0).text());
        bookCount = unescapeCode(data.eq(2).text());
      }
      bookArr.push({ num, bookName, bookCount });
    });
    return bookArr;
  }
  /**
   * @description 将处理后的文章信息转为json数据
   * @returns 处理后的信息
   */
  private generateJsonData(rankInfo: IrankInfo, filePath: string): string {
    let fileContent: IrankInfo = null;
    /* if (fs.existsSync(filePath)) {
            // 旧数据：从旧文件中读取的是json字符串，转成json
            fileContent = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
        } */
    // 构造新数据，返回记得转成json字符串。其实旧数据被覆盖了，其实也不需要旧数据
    fileContent = rankInfo;

    return JSON.stringify(fileContent);
  }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99

# 编译运行

使用的是 ts，最原始的就是先编译后运行，tsc和node src/crawler.js；编译输出路径设置在 tsconfig.json，新添outDir:"./dist"；
如果使用 ts-node 会更方便一点，npm install ts-node -g，局部安装也可以；然后运行ts-node src/crawler.ts；
如果能监听 ts 的变化然后自动编译成 js 就好了，使用tsc -w，再打开 package.json，在 scripts 里添加"build": "tsc -w"；
如果还能自动运行 js 就好了，npm install nodemon -D，打开 package.json，在 scripts 里添加"start": "nodemon node ./dist/crawler.js"；
但运行程序会生成爬虫数据，这样 nodemon 认为文件又编译了就又会运行一遍并无限循环下去，所以让 nodemon 忽略生成的数据，打开 package.json，添加"nodemonConfig":{"ignore":["data/*"]}；
如果又能自动编译又能自动运行就更好了，可以开两个命令行分别运行上面两条命令；也可以使用 concurrently；
npm install concurrently -D，打开 package.json，将 scripts 里的"build"修改为"dev:build"而"start"修改为"dev:start"，并在 scripts 里新添加命令concurrently npm:dev:*，意思是使用 concurrently 来运行所有"dev:*"写法的 npm 命令。

← python 建议 2.用express支撑小爬虫 →