可乐博客

谈 Python GIL：提升采集速度

gevent、multiprocessing、threadpool
使用协程、多进程、线程池时，需要关注的是常量、队列和锁。
在采集过程中，需要关注的是硬盘 IO、网络 IO 和每秒请求量。

@staticmethod  # thread pool
def pool(callback, lists,threadNum=10):
    """Run ``callback`` on a ``threadpool.ThreadPool`` and block until done.

    Args:
        callback: Callable handed to ``threadpool.makeRequests``.
        lists: Argument list handed to ``threadpool.makeRequests``; each
            resulting request is queued on the pool.
        threadNum: Number of worker threads in the pool (default 10).

    Returns:
        None. The call returns after ``ThreadPool.wait()`` completes.
    """
    import threadpool

    workers = threadpool.ThreadPool(threadNum)
    # Queue every generated work request, then wait for the pool to drain.
    for job in threadpool.makeRequests(callback, lists):
        workers.putRequest(job)
    workers.wait()


@staticmethod
def bPool(arg):
    """Fan ``arg['callback']`` out over ``arg['tnum']`` threads.

    Args:
        arg: Dict with the keys ``'tnum'`` (thread count), ``'cnum'``
            (value forwarded unchanged to every task), ``'arg'`` (payload
            forwarded unchanged to every task) and ``'callback'`` (callable
            invoked once per thread index).

    Each callback invocation receives a dict of the form
    ``{'cnum': arg['cnum'], 'tnum': <thread index>, 'arg': arg['arg']}``.
    The call blocks until all invocations have finished.
    """
    from multiprocessing.dummy import Pool as ThreadPool  # thread pool

    thread_count = arg['tnum']
    workers = ThreadPool(thread_count)
    # One task description per thread index 0..tnum-1.
    tasks = [
        {'cnum': arg['cnum'], 'tnum': index, 'arg': arg['arg']}
        for index in range(thread_count)
    ]
    workers.map(arg['callback'], tasks)
    workers.close()
    workers.join()

@staticmethod  # process pool
def sPool(callback,tnum=20,cnum='',arg=[]):
    """Run ``fleader.bPool`` in ``cnum`` processes, each with ``tnum`` threads.

    Args:
        callback: Callable forwarded to ``fleader.bPool`` under the
            ``'callback'`` key. It is sent to worker processes, so it must
            be picklable.
        tnum: Thread count forwarded to each ``fleader.bPool`` call.
        cnum: Number of worker processes. The default ``''`` means "use
            ``multiprocessing.cpu_count()``".
        arg: Payload forwarded unchanged to every task under the ``'arg'``
            key. The default list is never mutated here.

    Returns:
        None. The call blocks until every worker process has finished.
    """
    from multiprocessing import Pool as ProcessPool  # process pool
    from multiprocessing import cpu_count  # number of CPUs

    # Resolve the "unset" sentinel up front so that both the pool size and
    # the task list use the same integer; previously range('') raised
    # TypeError whenever cnum was left at its default.
    if cnum == '':
        cnum = cpu_count()
    spool = ProcessPool(cnum)
    # One task description per process index 0..cnum-1.
    arr = [
        {'cnum': i, 'tnum': tnum, 'callback': callback, 'arg': arg}
        for i in range(cnum)
    ]
    spool.map(fleader.bPool, arr)
    spool.close()
    spool.join()

def Manager():
    """Start a ``multiprocessing`` manager and return a queue and a lock.

    Returns:
        A ``(queue, lock)`` tuple of proxy objects created by
        ``manager.Queue()`` and ``manager.Lock()`` on a freshly started
        ``multiprocessing.Manager()``.
    """
    # Alias the import so it does not read like a recursive call.
    from multiprocessing import Manager as create_manager

    shared = create_manager()
    return shared.Queue(), shared.Lock()

@staticmethod  # gevent coroutines
def gPool(callback,urls=[],pnum=800):
    """Map ``callback`` over ``urls`` using a gevent pool of size ``pnum``.

    ``gevent.monkey.patch_all(socket=True, select=True)`` is applied on
    every call before the pool is created.

    Args:
        callback: Callable passed to ``Pool.map``.
        urls: Iterable of items passed to ``Pool.map`` (default: empty list).
        pnum: Size passed to ``gevent.pool.Pool`` (default 800).

    Returns:
        None. The result of ``Pool.map`` is discarded.
    """
    from gevent import monkey
    monkey.patch_all(socket=True, select=True)
    from gevent.pool import Pool

    Pool(pnum).map(callback, urls)

@staticmethod  # gevent primitives
def getGevent():
    """Return gevent's local-storage class, queue class and a semaphore.

    ``gevent.monkey.patch_all(socket=True, select=True)`` is applied on
    every call before the other gevent modules are imported.

    Returns:
        A ``(local, Queue, sem)`` tuple: the ``gevent.local.local`` class,
        the ``gevent.queue.Queue`` class (not instances), and a
        ``BoundedSemaphore(2)`` instance used as the lock.
    """
    from gevent import monkey
    monkey.patch_all(socket=True, select=True)
    from gevent.queue import Queue  # get, put
    from gevent.local import local
    try:
        from gevent.lock import BoundedSemaphore
    except ImportError:
        # Fall back to the gevent.coros location when gevent.lock is not
        # importable. Only ImportError is caught so that unrelated failures
        # (including KeyboardInterrupt) are no longer swallowed.
        from gevent.coros import BoundedSemaphore
    sem = BoundedSemaphore(2)  # acquire, release
    return local, Queue, sem  # local storage, queue, lock
Pythonic