利用二叉排序树统计Apache日志中的访问IP信息

二叉排序树(Binary Sort Tree),又称二叉查找树(Binary Search Tree),亦称二叉搜索树。为了统计Apache日志中的访问IP信息,我们采用二叉排序树,主要原因是二叉排序树相比于其他数据结构的优势在于查找、插入的时间复杂度较低。

二叉排序树或者是一棵空树,或者是具有下列性质的二叉树:
(1)若左子树不空,则左子树上所有结点的值均小于或等于它的根结点的值;
(2)若右子树不空,则右子树上所有结点的值均大于或等于它的根结点的值;
(3)左、右子树也分别为二叉排序树。

一、二叉排序树插入

void insert_data_into_tree(struct tree_node **p, int data)
{
	if(*p == NULL)
	{
		*p = malloc(sizeof(struct tree_node));

		if(*p != NULL)
		{
			(*p)->data = data;
			(*p)->lch = (*p)->rch = NULL;
		}
	}
	else if((*p)->data >= data)
	{
		insert_data_into_tree(&(*p)->lch, data);
	}
	else
	{
		insert_data_into_tree(&(*p)->rch, data);
	}
}

或者

struct tree_node *insert_data_into_tree(struct tree_node *p, int data)
{
	if(p == NULL)
	{
		p = malloc(sizeof(struct tree_node));

		if(p != NULL)
		{
			p->data = data;
			p->lch = p->rch = NULL;
		}
	}
	else if(p->data >= data)
	{
		p->lch = insert_data_into_tree(p->lch, data);
	}
	else
	{
		p->rch = insert_data_into_tree(p->rch, data);
	}

	return p;
}

二、统计代码

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define MAX_LINE_LEN 10240

// 二叉树结点定义
struct tree_node
{
	char *ip;
	int count;
	struct tree_node *lch;
	struct tree_node *rch;
};

// 插入IP数据到二叉排序树中
void insert_data_into_tree(struct tree_node **p, char *ip, int *ip_num)
{
	// 二叉排序树中没有该IP,插入。
	if(*p == NULL)
	{
		*p = malloc(sizeof(struct tree_node));

		if(*p != NULL)
		{
			(*p)->ip = malloc(sizeof(ip) + 1);
			strcpy((*p)->ip, ip);
			(*p)->count = 1;
			(*p)->lch = (*p)->rch = NULL;
			*ip_num = *ip_num + 1;
		}	
	}
	else
	{
		int ret = strcmp((*p)->ip, ip);

		// 二叉排序树中有该IP,将其count加1即可
		if(ret == 0)
		{
			(*p)->count++;
		}
		else if(ret > 0)
		{
			insert_data_into_tree(&(*p)->lch, ip, ip_num);
		}
		else
		{
			insert_data_into_tree(&(*p)->rch, ip, ip_num);
		}
	}
}

// 复制二叉树中的数据到数组中
void print_tree_to_array(struct tree_node *p, struct tree_node *ip_array, int *index)
{
	if(p == NULL) return;
	else
	{
		print_tree_to_array(p->lch, ip_array, index);
		print_tree_to_array(p->rch, ip_array, index);
		ip_array[*index] = *p;
		*index = *index + 1;
		free(p);
	}
}

// 比较函数
int is_greater(const void *p, const void *q)
{
	struct tree_node *a = (struct tree_node *)p;
	struct tree_node *b = (struct tree_node *)q;

	if(a->count < b->count) return 1;
	else if(a->count == b->count) return 0;
	else return -1;
}

int main(void)
{
	char buff[MAX_LINE_LEN];
	int line_num = 0;
	int ip_num = 0;
	struct tree_node *root = NULL;
	FILE *fp = fopen("access_2018-03-29.log", "r");

	while((fgets(buff, MAX_LINE_LEN, fp)) != NULL)
	{
		// 解析IP
		char *ip = strtok(buff, " ");
		// 将解析到的IP插入二叉排序树中
		insert_data_into_tree(&root, ip, &ip_num);
		line_num++;
	}
	fclose(fp);

	// 定义变长数组
	struct tree_node ip_array[ip_num];
	int index = 0;

	// 将二叉排序树中的数据复制到数组中,为排序做准备
	print_tree_to_array(root, ip_array, &index);

	// 快速排序,标准库函数
	qsort(ip_array, ip_num, sizeof(struct tree_node), is_greater);

	// 打印统计结果
	printf("IP统计\n");
	printf("-------------------------------\n");
	printf("访问次数\tIP\n");

	for(int i=0; i<20; i++)
	{
		printf("%d:\t\t%s\n", ip_array[i].count, ip_array[i].ip);
	}

	printf("-------------------------------\n");
	printf("IP总数:\t%d\n", ip_num);
	printf("访问次数:\t%d\n", line_num);

	return 0;
}

三、运行结果

[ycxie@fedora Workspace]$ gcc stat_log.c -o stat_log -Wall
[ycxie@fedora Workspace]$ ./stat_log
IP统计
-------------------------------
访问次数	IP
39580:		116.196.82.133
28216:		123.11.117.71
22028:		106.14.242.113
21915:		112.67.103.4
13640:		47.98.60.66
7929:		222.84.60.132
7341:		61.231.167.236
6326:		106.120.173.135
6281:		117.86.200.253
5804:		1.27.91.109
5157:		183.15.89.217
5093:		112.67.98.122
4927:		183.15.88.121
4606:		27.189.207.44
3369:		27.17.116.5
3308:		43.243.149.93
3108:		89.64.27.45
3058:		14.106.229.143
2823:		43.243.149.10
2791:		112.67.106.55
-------------------------------
IP总数:	36339
访问次数:	1569554

Leave a Reply

Your email address will not be published. Required fields are marked *