Python QQ纯真库解析

QQWry.Dat不是普通的txt文件,所以python解析要特定的程序。下面是一个Python类

#!/usr/bin/python
# -*- coding: UTF-8 -*-

# author : firefoxbug
# E-Mail : wanghuafire@gmail.com
# Blog   : www.firefoxbug.net

import sys
import socket
from struct import pack, unpack

class IPInfo(object):
    '''QQWry.Dat数据库查询功能集合
    '''
    def __init__(self, dbname):

        self.dbname = dbname
        f = file(dbname, 'r')
        self.img = f.read()
        f.close()

        (self.firstIndex, self.lastIndex) = unpack('II', self.img[:8])
        self.indexCount = (self.lastIndex - self.firstIndex) / 7 + 1

    def getString(self, offset = 0):
        o2 = self.img.find('\0', offset)
        gb2312_str = self.img[offset:o2]
        try:
            utf8_str = unicode(gb2312_str,'gb2312').encode('utf-8')
        except:
            return '未知'
        return utf8_str

    def getLong3(self, offset = 0):
        s = self.img[offset: offset + 3]
        s += '\0'
        return unpack('I', s)[0]

    def getAreaAddr(self, offset = 0):
        ''' 通过给出偏移值,取得区域信息字符串,'''

        byte = ord(self.img[offset])
        if byte == 1 or byte == 2:
            p = self.getLong3(offset + 1)
            return self.getAreaAddr(p)
        else:
            return self.getString(offset)

    def getAddr(self, offset, ip = 0):
        img = self.img
        o = offset
        byte = ord(img[o])

        if byte == 1:
            return self.getAddr(self.getLong3(o + 1))

        if byte == 2:
            cArea = self.getAreaAddr(self.getLong3(o + 1))
            o += 4
            aArea = self.getAreaAddr(o)
            return (cArea, aArea)

        if byte != 1 and byte != 2:

            cArea = self.getString(o)
            o = self.img.find('\0',o) + 1
            aArea = self.getString(o)
            return (cArea, aArea)

    def find(self, ip, l, r):
        ''' 使用二分法查找网络字节编码的IP地址的索引记录'''
        if r - l <= 1:
            return l

        m = (l + r) / 2
        o = self.firstIndex + m * 7
        new_ip = unpack('I', self.img[o: o+4])[0]
        if ip <= new_ip:
            return self.find(ip, l, m)
        else:
            return self.find(ip, m, r)

    def getIPAddr(self, ip):
        ip = unpack('!I', socket.inet_aton(ip))[0]
        i = self.find(ip, 0, self.indexCount - 1)
        o = self.firstIndex + i * 7

        o2 = self.getLong3(o + 4)

        (c, a) = self.getAddr(o2 + 4)
        return (c, a)

    def output(self, first, last):
        for i in range(first, last):
            o = self.firstIndex +  i * 7
            ip = socket.inet_ntoa(pack('!I', unpack('I', self.img[o:o+4])[0]))
            offset = self.getLong3(o + 4)
            (c, a) = self.getAddr(offset + 4)
            print "%s %d %s/%s" % (ip, offset, c, a)

'''search ip and its'localtion '''
def search_ip(ip_instance,ipaddr):
    if ipaddr :
        try :
            (c, a) = ip_instance.getIPAddr(ipaddr)
            print '%s/%s'%(c,a)
        except :
            print 'Unknow/Unknow'

if __name__ == '__main__':
    ip_instance = IPInfo('./QQWry.Dat')
    search_ip(ip_instance,"8.8.8.8")

纯真库优势就是可以精确到公司,街道,网吧,但是有麻烦的一点就是统计起来不方便,因为都是一个字符串,需要再做分词。

标签:Linux, Python

评论已关闭