larbin源码分析.jake

larbin源码分析(一) global文件 Connexion结构

larbin源码分析(一) 从global文件分析每一个结构

一本系列主要是分析larbin开源爬虫的源代码，主要思路是先从global文件中的各个重要的结构开始。

1 Connexion 此处为一个结构体

该结构体主要的作用是进行连接服务器的操作。其中析构函数基本不执行，因为此结构是循环利用的，在

程序中保持一定的数量。小扩展：FetchOpen 类主要用来建立连接，而FetchPipe类主要用来进行连接之后的数据交换。

结构体中成员变量

// State kept for one connection to a server. Instances are recycled (see
// recycle()) rather than destroyed.
struct Connexion{

char state ; // state of the socket: EMPTY, CONNECTING, WRITE, OPEN

int pos ; // position up to which the request has been sent

FetchError err ; // how the query terminated

int socket ; // number of the fds

int timeout ; // timeout value of the connection

LarbinString request ; // http request header

file * parser ; // parser for the downloaded page

char buffer[maxPageSize] ; // stores the data of the downloaded page

Connexion() ;

~Connexion() ;

//recycle

void recycle() ; // makes this structure reusable

} ;

2 具体成员函数的实现

// Builds an unused connexion: the socket state starts as emptyC and no
// parser is attached yet.
Connexion::Connexion()
    : state(emptyC), parser(NULL)
{
}

// Destructor: reports an error (failed assertion) as soon as it is called.
Connexion::~Connexion()

{

assert(false) ;

}

/* Recycle a connexion so that the structure can be used again.
 * Deletes the parser object and recycles the request string.
 */
void Connexion::recycle()
{
  delete parser ;      // deleting a null pointer is a no-op
  parser = NULL ;      // do not keep a dangling pointer: a second recycle() must not delete twice
  request.recycle() ;  // calls recycle on the LarbinString
}

3 utils包下的connexion.h 和connexion.cc的具体代码实现

// Larbin

// Sebastien Ailleret

// 15-11-99 -> 14-12-99

#ifndef CONNEXION_H
#define CONNEXION_H

/* make write until everything is written
 * return 0 on success, 1 otherwise
 * Don't work on non-blocking fds
 */
int ecrire (int fd, char *buf);

/* make write until everything is written
 * return 0 on success, 1 otherwise
 * Don't work on non-blocking fds
 */
int ecrireBuff (int fd, char *buf, int count);

/** Write an int on a fds
 * (uses ecrire)
 */
int ecrireInt (int fd, int i);
int ecrireInt2 (int fd, int i);

/** Write a long on a fds
 * (uses ecrire)
 */
int ecrireLong (int fd, long i);

/* Write a char on a fds
 * return 0 on success, 1 otherwise
 * Don't work on non-blocking fds
 */
int ecrireChar (int fd, char c);

#endif // CONNEXION_H

在connexion.h中各个成员函数的作用主要是向套接字中写入数据。

写入操作中主要使用了write 系统调用。

unistd.h中

ssize_t write(int fd, const void *buf, size_t count);

若是发生写错误，则返回值为-1 ，但是若此时的错误状态为EINTR ,则表示是发生了中断操作，此时应该继续进行写操作。

若是当前执行的写操作出现了等待的事情，则不需要报错，应该继续写，直到等待的事情结束。

(1) 误区

write并不是立即执行写操作，而是将数据写入进内核缓冲区。

一般内核区比较稳定，不会出现问题。

（4）下面是connexion的实现代码

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#include "options.h"

using namespace std ;

/*********************************/ /* various functions for writing */

/* make write until everything is written
 * return 0 on success, 1 otherwise
 * Don't work on non-blocking fds
 *
 * buf is a 0-terminated string; its strlen(buf) bytes are sent with
 * write(), resuming after partial writes and after EINTR.
 */
int ecrire (int fd, char *buf) {
  int pos = 0;
  int len = strlen(buf);
  while (pos < len) {
    int i = write(fd, buf + pos, len - pos);
    if (i == -1) {
      if (errno != EINTR) {
        // real error: leave the loop with pos != len
        pos = len + 1;
      }
      // EINTR: interrupted by a signal, simply try again
    } else {
      pos += i;
    }
  }
  return pos != len;
}

/* make write until everything is written
 * return 0 on success, 1 otherwise
 * Don't work on non-blocking fds
 *
 * Sends exactly count bytes of buf with write(), resuming after partial
 * writes and after EINTR; any other error is reported with perror.
 */
int ecrireBuff (int fd, char *buf, int count) {
  int pos = 0;
  while (pos < count) {
    int i = write(fd, buf + pos, count - pos);
    if (i == -1) {
      switch (errno) {
      case EINTR:
        // interrupted by a signal: try again
        break;
      default:
        // real error: leave the loop with pos != count
        pos = count + 1;
        perror("buf error");
        break;
      }
    } else {
      pos += i;
    }
  }
  return pos != count;
}

/** Write an int on a fds * (uses ecrire)

int ecrireInt (int fd, int i) { char buf[20];

sprintf(buf, "%d", i); return ecrire(fd, buf);

/* Write i as i/10 in decimal followed by one character for i%10
 * (uses ecrire, whose result is returned unchanged)
 */
int ecrireInt2 (int fd, int i) {
  const int leading = i / 10;
  const int lastDigit = i % 10 + '0';
  char text[20];
  sprintf(text, "%d%c", leading, lastDigit);
  return ecrire(fd, text);
}

/* Write an int on a fds, formatted with the printf-style format f
 * (uses ecrire, whose result is returned)
 * The formatted text is truncated to the size of the local buffer
 * instead of overflowing it.
 */
int ecrireInti (int fd, int i, char *f) {
  char buf[100];
  snprintf(buf, sizeof(buf), f, i);
  return ecrire(fd, buf);
}

/* Write a long on a fds, formatted with the printf-style format f
 * (uses ecrire, whose result is returned)
 * The formatted text is truncated to the size of the local buffer
 * instead of overflowing it.
 */
int ecrireIntl (int fd, long i, char *f) {
  char buf[100];
  snprintf(buf, sizeof(buf), f, i);
  return ecrire(fd, buf);
}

/** Write an int on a fds

* (uses ecrire)

int ecrireLong (int fd, long i) {

char buf[30];

sprintf(buf, "%ld", i);

/* Write a char on a fds
 * return 0 on success, 1 otherwise
 * Don't work on non-blocking fds
 */
int ecrireChar (int fd, char c) {
  int pos = 0;
  while (pos < 1) {
    int i = write(fd, &c, 1);
    if (i == -1) {
      if (errno != EINTR) {
        // real error: leave the loop with pos != 1
        pos = 2;
      }
      // EINTR: interrupted by a signal, try again
    } else {
      pos += i;
    }
  }
  return pos != 1;
}

（5）综上

Connexion主要处理的是连接相关的信息，其connexion中主要实现的是，向套接字中写入数据。

下一个系列，处理的是LarbinString 相关，该类主要是处理http报头的。

LARBIN源码分析(二) GLOBAL文件LARBINSTRING类对象的实现

LARBINSTRING类对象的实现

一该类介绍

LarbinString类主要是字符串处理，主要的成员参数是char * chaine 表示字符串的内容存储的指针地址。

还有pos 表示当前string的位置，size表示最大的容量。

成员函数，都为一些添加字符，添加缓冲区的操作。其中的主要的是recycle() ，getString() ，giveString()等函数。

二类的头文件

// Larbin

// Sebastien Ailleret

// 20-12-99 -> 05-05-01

#ifndef STRING_HH
#define STRING_HH

#include <assert.h>

#include "types.h"
#include "utils/debug.h"

/* A growable character string: chaine points to the storage, pos is the
 * current length and size the current capacity.
 */
class LarbinString {
 private:
  char *chaine;  // pointer to the character storage
  uint pos;      // current position (length of the string)
  uint size;     // total capacity

 public:
  // Constructor
  LarbinString (uint size=STRING_SIZE);

  ~LarbinString ();

  // Recycle this string
  void recycle (uint size=STRING_SIZE);

  // get the char *
  // it is deleted when you delete this String object
  char *getString ();

  // give a char * : it creates a new one
  char *giveString ();

  // append a char
  void addChar (char c);

  // append a char *
  void addString (char *s);

  // append a buffer
  void addBuffer (char *s, uint len);

  // length of the string
  inline uint getLength () { return pos; }

  // get a char of this string
  inline char operator [] (uint i) {
    assert(i<=pos);
    return chaine[i];
  }

  // change a char
  void setChar (uint i, char c);
};

#endif // STRING_HH

三实现代码

该代码实质上是实现了，一个string类型，可以自动地增长容量，实现动态地增添操作。

// Larbin

// Sebastien Ailleret

// 20-12-99 -> 10-12-01

#include

#include "options.h"

#include "utils/text.h"

#include "utils/string.h"

using namespace std ;

// Constructor: allocates a buffer of `size` chars and starts out as the
// empty string
LarbinString::LarbinString (uint size) {
  this->size = size;
  pos = 0;
  chaine = new char[size];
  *chaine = 0;
}

// Destructor: releases the character buffer

LarbinString::~LarbinString () {

delete [] chaine ;

}

// Recycle this string: empty it and, when the buffer is larger than
// `size`, replace it by a fresh buffer of `size` chars
void LarbinString::recycle (uint size) {
  if (this->size > size) {
    // the current buffer is bigger than requested: shrink it
    delete [] chaine;
    chaine = new char[size];
    this->size = size;
  }
  pos = 0;
  chaine[0] = 0;
}

// get the char * : returns the internal buffer itself (no copy is made)

char *LarbinString::getString () {

return chaine ;

}

/** give a new string (allocate a new one

* the caller will have to delete it

char *LarbinString::giveString () {

return newString(chaine) ;

}

// append a char

void LarbinString::addChar (char c) {

chaine[pos] = c;

pos++ ;

if(pos >= size) //如果当前的

{

char * tmp = new char[2 * size] ;

memcpy(tmp , chaine , pos) ;

delete [] chaine ;

chaine = tmp ;

size *= 2 ;

}

chaine[pos] = 0 ;

}

// append a char * : forwards the 0-terminated string and its length to
// addBuffer
void LarbinString::addString (char *s) {
  addBuffer(s, strlen(s));
}

// append a buffer of len chars; the storage grows (at least doubling)
// when the result plus its terminating 0 would not fit
void LarbinString::addBuffer (char *s, uint len) {
  if (size <= pos + len) {
    size *= 2;
    if (size <= pos + len) size = pos + len + 1;
    char *tmp = new char[size];
    memcpy(tmp, chaine, pos);
    delete [] chaine;
    chaine = tmp;
  }
  memcpy(chaine+pos, s, len);
  pos += len;        // the string is now len chars longer
  chaine[pos] = 0;   // keep it 0-terminated
}

// change a char: overwrites the char at index i (no bounds check is done)

void LarbinString::setChar (uint i, char c) {

chaine[i] = c;

}

四总结

LarbinString类主要进行的是字符串处理，实质上是自己实现了一个String库。

LARBIN源代码分析<三> URL类分析

一分析utils包中的url类

该类代表实际中的一个url，成员变量主要有，char * file ，char * host , uint16_t port , int8_t depth, char * cookie

还有一个public 属性的in_addr 表示一个ipv4的地址。

成员函数中主要有一些，比如构造函数，返回url，添加cookie等操作。

二 实例代码如下

// Larbin

// Sebastien Ailleret

// 15-11-99 -> 14-03-02

/* This class describes an URL */

#ifndef URL_H
#define URL_H

#include <netinet/in.h>
#include <stdint.h>

#include "types.h"

bool fileNormalize (char *file);

class url {
 private:
  char *host;
  char *file;
  uint16_t port; // the order of variables is important for physical size
  int8_t depth;

  /* parse the url */
  void parse (char *s);

  /** parse a file with base */
  void parseWithBase (char *u, url *base);

  /* constructor used by giveBase */
  url (char *host, uint port, char *file);

 public:
  /* Constructor : Parses an url (u is deleted) */
  url (char *u, int8_t depth, url *base);

  /* constructor used by input */
  url (char *line, int8_t depth);

  /* Constructor : read the url from a file (cf serialize) */
  url (char *line);

  /* Destructor */
  ~url ();

  /* inet addr (once calculated) */
  struct in_addr addr;

  /* Is it a valid url ? */
  bool isValid ();

  /* print an URL */
  void print ();

  /* return the host */
  inline char *getHost () { return host; }

  /* return the port */
  inline uint getPort () { return port; }

  /* return the file */
  inline char *getFile () { return file; }

  /** Depth in the Site */
  inline int8_t getDepth () { return depth; }

  /* Set depth to max if we are at an entry point in the site
   * try to find the ip addr
   * answer false if forbidden by robots.txt, true otherwise */
  bool initOK (url *from);

  /** return the base of the url
   * give means that you have to delete the string yourself */
  url *giveBase ();

  /** return a char * representation of the url
   * give means that you have to delete the string yourself
   * buf must be at least of size maxUrlSize
   * returns the size of what has been written (not including '\0') */
  int writeUrl (char *buf);

  /* serialize the url for the Persistent Fifo */
  char *serialize ();

  /* very thread unsafe serialisation in a static buffer */
  char *getUrl();

  /* return a hashcode for the host of this url */
  uint hostHashCode ();

  /* return a hashcode for this url */
  uint hashCode ();

#ifdef URL_TAGS
  /* tag associated to this url */
  uint tag;
#endif // URL_TAGS

#ifdef COOKIES
  /* cookies associated with this page */
  char *cookie;
  void addCookie(char *header);
#else // COOKIES
  inline void addCookie(char *header) {}
#endif // COOKIES
};

#endif // URL_H

三代码分析

url中的实现类主要是，创建url，其中创建规则如下：

http://www.example.com/r/0343/ttt.html

则host为www.example.com, file 为/r/0343/ttt.html

url的构造函数即根据上述规则构建url类。若是含有base url 则新的url 的file为base->file + 新url 的file。

（2）另外url的hash函数即是利用了file 字符串和host字符串。

/* return a hashcode for this url
 * the hash starts from the port and mixes in every char of host, then
 * every char of file; the result is reduced modulo hashSize
 */
uint url::hashCode () {
  unsigned int h=port;
  unsigned int i=0;
  while (host[i] != 0) {
    h = 31*h + host[i];
    i++;
  }
  i=0;
  while (file[i] != 0) {
    h = 31*h + file[i];
    i++;
  }
  return h % hashSize;
}

(3) cookie的处理函数如下

若addCookie(char * head) 中的head字符串是以 set-cookie: 开始的，则将head中位于前12个字符

（即"set-cookie: "前缀）之后的内容添加到cookie变量中。

四综上：

url 类中的成员变量，char * file ,char * host , port , cookie 能够表示一个url。

并且url类中提供了解析函数，使用户可以根据从网页中爬取的url构造url 类对象。

LARBIN源码分析(四) HASHTABLE类对象的实现

LARBIN源码分析(四)HASHTABLE类对象的实现

一hashTable类对象

作用：爬虫将爬取的url存储在该类对象中，存储方式是，url对象的hashcode，映射到对应的表项中。

其中每一个url，映射成对应table中的一个bit，其中表空间大小为64000000（单位为bit）,大小的定义存储在types.h头文件中。

爬取到的url需要在hashTable中进行一次查找，若是不存在，则进行进一步遍历。已经存在，则不需要遍历。

二成员函数

HashTable(bool created ) 构造函数

~HashTable() 析构函数

save() ; 将hashTable存储在文件中。

test(url * U) ;判断对应的url是否在hashTable 中

set(url * u) ; 将该url在hashTable中对应的bit位置为1。

testset(url* u) 若url之前不存在，则完成添加并返回true；若是之前已经存在，则返回false。

三 实现

(1) HashTable(bool created)

当created为true的时候，表示不需要从文件中读取，直接进行初始化，将table区域全部初始化为0即可。

若created为false的时候，表示需要从文件hashtable.bak中读取存储的数据，将其存放进hashtable类中的table缓冲区中。

(2) save() 函数实现

rename("hashtable.bak", "hashtable.old"); // move the current hashtable.bak aside as hashtable.old

int fds = creat("hashtable.bak", 00600); // create a fresh hashtable.bak

if (fds >= 0) {

ecrireBuff(fds, table, hashSize/8); // write the hash table (hashSize/8 bytes) to the file

close(fds); // close the file

}

unlink("hashtable.old"); // remove hashtable.old

（3) bool test(url * U) // tells whether the url is already in the hashtable
{
  int code = U->hashCode() ;   // hash code of the url
  int pos = code / 8 ;         // index of the byte holding the bit of this url
  int index = 1 << code % 8 ;  // mask selecting that bit inside the byte
  return table[pos] & index ;
}

(4) void hashTable::set (url *U) // records the url in the hashtable
{
  int code = U->hashCode() ;   // hash code of the url
  int pos = code / 8 ;         // each url is a single bit of the table: byte index
  int index = 1 << code % 8 ;  // mask selecting that bit inside the byte
  table[pos] |= index ;
}

(5) bool hashTable::testSet (url *U) { // returns false if the url was already present; otherwise inserts it and returns true
  int code = U->hashCode() ;            // hash code of the url
  int pos = code / 8 ;                  // each url is a single bit of the table: byte index
  unsigned int index = 1 << code % 8 ;  // mask selecting that bit inside the byte
  int res = table[pos] & index ;        // previous value of the bit
  table[pos] |= index ;
  return !res ;
}

四总结

hashTable 用来存储爬取下来的url，若是已经存在则不进行插入。

每一个url使用一个bit来进行存储。

LARBIN源码分析(五) HASHDUP类对象的实现

一类的成员变量及主要功能

(1) 成员变量

ssize_t size; // size of the hash table

char *table; // storage area of the hash table

char *file; // file in which the table is stored: the in-memory hash table can be saved on disk

(2) 主要功能

该类和hashTable代码比较相似，但是hashTable处理的是url去重，而hashDup处理的是网页内容的去重，

不会对完全相同的网页进行爬取，但是不保证相似网页的排重。改进的一个方向。

二具体的成员函数

构造函数

hashDup (ssize_t size, char *init, bool scratch); size表示hash 表的大小，单位为bit。

init参数表示hash表存储在磁盘的文件名称。scratch若为true表示重新构建hash表，

若为false，则表示需要从磁盘文件中读取hash表。

~hashDup()函数具体操作为清空（释放）table内存。

下面主要讲解网页内容去重的函数

// doc is the content of a page. Its chars strictly between 'A' and 'z' are
// folded into a hash code; the bit for that code is then tested and set.
// Returns true when the bit was not set before, false otherwise.
bool hashDup::testSet (char *doc) {
  unsigned int code = 0;
  for (const char *p = doc; *p != 0; ++p) {
    const char c = *p;
    if ('A' < c && c < 'z') {
      code = (code*23 + c) % size;
    }
  }
  const unsigned int pos = code / 8;          // byte holding the bit
  const unsigned int bits = 1 << (code % 8);  // mask of the bit in that byte
  const int res = table[pos] & bits;          // previous value of the bit
  table[pos] |= bits;
  return !res;
}

save()函数

主要的作用就是，将table区域中的数据，存储在外部磁盘中，进行持久化操作。

三总结

该类为网页内容去重hash函数的具体实现。

LARBIN源码分析(六) LARBIN中线程处理类

一线程类

larbin下的线程操作类，主要在mypthread.h 中定义，实质上是利用宏定义，封装了pthread.h中的系统调用。

一个进程可以有多个线程，每个线程都有自己的处理流程。

二具体实现

// Type of a thread entry point: takes an untyped pointer and returns one
typedef void* (*StartFun) (void *);

// Starts a thread running run(arg); presumably wraps pthread_create, the
// definition is not visible here
void startThread (StartFun run, void *arg);

startThread 函数实质上是调用pthread_create 启动一个新的线程。

//下面为线程同步的操作

// Condition-variable wrappers (thread synchronisation)
#define mypthread_cond_init(x,y) pthread_cond_init(x,y)
#define mypthread_cond_destroy(x) pthread_cond_destroy(x)
// waits on condition variable x (with mutex y) for as long as c holds
#define mypthread_cond_wait(c,x,y) while (c) { pthread_cond_wait(x,y); }
#define mypthread_cond_broadcast(x) pthread_cond_broadcast(x)

// Mutex wrappers (mutual exclusion)
#define mypthread_mutex_init(x,y) pthread_mutex_init(x,y)
#define mypthread_mutex_destroy(x) pthread_mutex_destroy(x)
#define mypthread_mutex_lock(x) pthread_mutex_lock(x)
#define mypthread_mutex_unlock(x) pthread_mutex_unlock(x)

LARBIN源码分析(七) LARBIN中的2种容器与4个URL队列

LARBIN源码分析(七) LARBIN中的2种容器4个URL队列

一 larbin中的2种类型的队列

// The four url queues, from highest to lowest priority.
// SyncFifo is a class template, so its element type must be given.
static SyncFifo<url> *URLsPriority;      // highest priority
static SyncFifo<url> *URLsPriorityWait;  // second highest priority
static PersistentFifo *URLsDisk;         // low priority
static PersistentFifo *URLsDiskWait;     // lowest priority

上述4个URL队列的优先级，由上到下依次递减。

四种url队列的具体使用，留待以后分析。

二下面主要分析SyncFifo

该类实际上为一个同步处理的先进先出的队列。

1 类的主要成员变量

uint in, out;             // in: index used to enqueue, out: index used to dequeue
uint size;                // size of the queue
T **tab;                  // array of pointers to T; T is the class template parameter
pthread_mutex_t lock;     // mutex
pthread_cond_t nonEmpty;  // condition variable (presumably signalled when the queue is not empty)

2 一般同样的队列同步框架

(1) 放入队列中的处理

pthread_mutex_lock(&lock)

//放入队列操作

put() //入队操作

pthread_cond_signal(&nonEmpty) //向取队列线程发送信号

pthread_mutex_unlock(&lock)

(2)从队列中取出

pthread_mutex_lock(&lock)

while(empty)

pthread_cond_wait(&nonEmpty, &lock) //执行该句的时候，会释放该Mutex锁lock

//取队列处理

pthread_mutex_unlock(&lock)

3 成员函数

（1）构造函数

template

SyncFifo::SyncFifo (uint size) { //执行的操作：构建队列缓冲区 tab = new T*[size]; //初始化头尾指针，初始化互斥变量

this->size = size;

in = 0;

out = 0;

mypthread_mutex_init (&lock, NULL);

mypthread_cond_init (&nonEmpty, NULL);

}

（2）~SyncFifo()

析构函数执行tab缓冲区的释放，以及互斥变量，互斥条件变量的销