# -*- coding: utf-8 -*-
"""Rule-based quality inspection of transcribed sales calls.

``recognition`` scans every utterance of a call against the regular-expression
rule tables below and writes the findings to an ``.xlsx`` workbook.
``process_queries`` normalises one transcribed utterance (filler words,
stuttered repetitions, doubled punctuation) before the rules are applied.
"""
import os
import re
import glob
import datetime

from openpyxl import Workbook

# Patterns matched against agent utterances (channel 2 only).  Once any of
# them has matched, the "phone can be used" rule in re_dict2 is no longer
# reported for the rest of the call.
project_list = [
    re.compile(r'.*(少儿英语|开心鼠|(猿|圆|缘|元)(编程|变成|边城|边上)|英语.{0,10}(体验|启蒙|课程|课)|(网易|有道)|河小象|写字课|叫叫阅读).*'),
    re.compile(r'.*(阅读.{0,10}(兴趣|课程|启蒙|课)).*'),
]

# Rule that is reported only while the customer has not asked about the
# platform (see ``sign_ask`` in ``recognition``).
re_dict1 = {
    '销售禁行': ['^((?!app|下载).)*(高途|高图|高徒).*(工作人员|客服)((?!app|下载).)*$'],
}

# Rule category -> list of regex sources.  A match in any category except
# '服务专业' is reported as an error row; a match in '服务专业' instead clears
# the initial "missing" flag for that category (see ``type_dict``).
re_dict2 = {
    '客户冲突': [
        r'(垃圾|我操|傻逼|叫声|神经病|有病|卧槽|他妈的|贱人|智商|卑鄙无耻|衣冠禽兽|我丢你老母|屎|装疯卖傻|装傻充愣|疯子|王八|疯狗|乱咬人|听不懂人话|不说人话|猪狗不如|混蛋|没脑子|傻屌|去死吧|滚|TM|人渣|怂|妈逼|操你妈|妈逼|傻子|SB|sb)',
        r'(有人生没人养|不要脸|叫声妈|叫声爹|装纯洁|在东莞挣钱|有脸活|枉为人|噎着|臭|臭嘴|坑蒙拐骗|生下你|镜子|照照|三观不正|吃软饭|脑残|丧样|人话|个泡|装逼|牛逼|婊子|滚|不是人|贱|废话|丧失|嘴硬|三观不正|出息|没教养|上门女婿|装什么装|没素质|找小三|吃软饭|拿镜子照照|撒泡尿照照|几斤几两|蠢|不着调|屎|脑残|人渣|是不是人|吸毒|溜冰|贩毒|瘾君子|赢钱|赌场|毒品|大波|有种|犯贱|痴呆|败类|丢脸|脑残|人渣|天堂|脸皮厚)',
        r'(我是你.{1,5}(爸|妈|爷|奶))',
        # NOTE(review): ``((?!.).)*`` can never consume a character, so this
        # only matches an utterance that is exactly the two characters
        # between the anchors -- confirm that is intended.
        r'^((?!.).)*妈的((?!.).)*$',
        r'(东莞.*母亲|东莞.*女儿|母亲.*东莞|女儿.*东莞|你母亲)',
        r'(穷|就缺.{0,5}(钱|报名费)|都不舍得)',
    ],
    '弄虚作假': [
        r'^((?!不支持|不能|特价课).)*退费((?!不支持|不能|特价课).)*$',
        r'^((?!不合适).)买.{0,5}(((高|低)一年级)|下学期)((?!不合适).)*$',
        r'帮.{0,5}(亲戚|朋友|同事).{0,5}买|多买点|(亲戚|朋友|同事).{0,5}有需要吗|帮完成个任务',
    ],
    '不当获利': [
        r'(红包|现金)?.*返利|(发|给).{0,5}(红包|现金)',
    ],
    '虚假承诺': [
        # Disabled rule, kept for reference:
        # r'^((?!不).)*(初三|高一|高二|高三|高中)?.*(再|可以).{0,5}(修改|更改|更换|调整|调课)((?!不).)*$',
        r'^((?!课后|辅导|检查).)*一对一.{0,5}(上课|教学|学习)((?!课后|辅导|检查).)*$',
        r'(增加).{0,5}课时',
        r'手机[^号].{0,5}[^不](可以|支持).{0,5}(上课|观看|回放|操作)',
    ],
    '销售禁行': [
        r'(工资|待遇|几口人|身体好吗|吃饭了吗)',
    ],
    '服务态度': [],
    '服务专业': [
        r'[^不](需要|用).{0,10}(电脑|平板)',
        r'(电脑|平板).*(可以上课)',
    ],
}

# Pattern used in channel 1 to detect that the customer asked which
# platform / product the call is about.
re_dict3 = {
    '询问平台': [
        r'(这个是|哪里|哪个|哪的|怎么办|什么(平台|东西|玩意))',
    ],
}


def _flag_agent_violations(ws, call_id, customer_id, content, sign_ask,
                           type_dict, skip_phone_rule=False):
    """Check one utterance against re_dict1/re_dict2 and log every hit.

    One row per matching rule is appended to ``ws``.  A match in the
    '服务专业' category writes no row; it clears ``type_dict['服务专业']``.

    :param ws: worksheet the error rows are appended to.
    :param call_id: value written to the first column of each row.
    :param customer_id: value written to the second column of each row.
    :param content: normalised utterance text.
    :param sign_ask: when true, the re_dict1 rule is not reported.
    :param type_dict: per-category flags, mutated in place.
    :param skip_phone_rule: when true, the "phone can be used" rule of
        '虚假承诺' is not reported.
    :return: True if at least one error row was written.
    """
    phone_rule = r'手机[^号].{0,5}[^不](可以|支持).{0,5}(上课|观看|回放|操作)'
    flagged = False

    sales_rule = re_dict1['销售禁行'][0]
    result = re.search(sales_rule, content)
    if result and not sign_ask:
        ws.append([call_id, customer_id, '', '{}错误'.format('销售禁行'),
                   sales_rule, result.group(), '1_' + content])
        flagged = True

    for false_type, rules in re_dict2.items():
        for re_rule in rules:
            result = re.search(re_rule, content)
            if not result:
                continue
            if false_type == '服务专业':
                # Seeing one of these phrases means the requirement was met.
                type_dict[false_type] = 0
            elif re_rule == phone_rule and skip_phone_rule:
                continue
            else:
                ws.append([call_id, customer_id, '', '{}错误'.format(false_type),
                           re_rule, result.group(), '1_' + content])
                flagged = True
    return flagged


def recognition(
        conversation, repeat_time, call_id, customer_id, who, call_date, wav_path, ip, time, channel, phone, stop_sign
):
    """Inspect one call transcript and save the findings as a workbook.

    The first worksheet row is a header with the call metadata; each
    following row is either a plain utterance or an error row naming the
    rule category, the regex and the matched text.  The workbook is saved to
    ``./<ip>_data_dir/<date>_<customer_id>_<sign_num>_<time>.xlsx``; workbooks
    in that directory are deleted first when the last one listed by ``glob``
    does not start with today's date.

    :param conversation: for ``channel == 1`` an iterable of
        ``(content, type)`` pairs; for ``channel == 2`` an iterable of
        ``(content, type, start_time, end_time)`` tuples where ``type == 1``
        selects the branch that runs the violation rules.
    :param repeat_time: unused here.
    :param call_id: written to every row.
    :param customer_id: written to every row and used in the file name.
    :param who: written to the header row.
    :param call_date: written to the header row.
    :param wav_path: written to the header row.
    :param ip: prefix of the output directory name.
    :param time: last component of the output file name.
    :param channel: 1 or 2; any other value only produces the header and
        summary rows.
    :param phone: written to the header row.
    :param stop_sign: unused here.
    :return: 1 if any error row was written for an utterance, else 0.  Also
        0 on the early exit taken when a channel-2 utterance contains
        '留言' or '正在通话中'.
    """
    wb = Workbook()
    ws = wb.active
    ws.append([call_id, customer_id, who, call_date, wav_path, phone])

    sign_num = 0
    sign_ask = False
    # '服务专业' starts at 1 and is cleared when one of its rules matches, so
    # it is reported at the end only if no such phrase was ever seen.
    type_dict = {'客户冲突': 0, '弄虚作假': 0, '不当获利': 0, '虚假承诺': 0,
                 '销售禁行': 0, '服务态度': 0, '服务专业': 1, '询问平台': 0}

    if channel == 1:
        for utterance in conversation:
            content, _speaker = utterance
            try:
                content = process_queries(content)
            except Exception:
                # Deliberate best effort: fall back to the raw text when
                # normalisation fails.
                pass
            plain_row = [call_id, customer_id, '', '', '', '', '1_' + content]

            # Customer asks which platform this is.
            if re.search(re_dict3['询问平台'][0], content):
                sign_ask = True
                ws.append(plain_row)
                continue

            # Customer expresses an intent to complain.
            if re.search(r'(投诉|领导在哪)', content):
                ws.append(plain_row)
                continue

            # Otherwise apply the violation rules.
            if _flag_agent_violations(ws, call_id, customer_id, content,
                                      sign_ask, type_dict):
                sign_num = 1
                continue
            ws.append(plain_row)

    elif channel == 2:
        # Voicemail / line-busy detection: dump the raw conversation and stop.
        # NOTE(review): element [-1] of a 4-tuple is end_time, not the
        # speaker type -- confirm this is the intended field.
        for utterance in conversation:
            if utterance[-1] == 1:
                continue
            if any(word in utterance[0] for word in ('留言', '正在通话中')):
                conver_list = [call_id, customer_id, who, call_date, '0', '']
                conver_list.extend(str(item[-1]) + '_' + item[0] for item in conversation)
                ws.append(conver_list)
                # The original called ``.format()`` with no argument, which
                # always raised IndexError, and never created the directory.
                # NOTE(review): call_id is used as the file stem -- confirm.
                if not os.path.isdir('./data_dir/'):
                    os.makedirs('./data_dir/')
                wb.save('./data_dir/{}.xlsx'.format(call_id))
                return 0

        sign_other_project = False
        for utterance in conversation:
            content, speaker, _start_time, _end_time = utterance
            content = process_queries(content)
            if speaker == 1:
                if any(project.search(content) for project in project_list):
                    sign_other_project = True
                if _flag_agent_violations(ws, call_id, customer_id, content,
                                          sign_ask, type_dict,
                                          skip_phone_rule=sign_other_project):
                    sign_num = 1
                    continue
                ws.append([call_id, customer_id, '', '', '', '', '1_' + content])
            else:
                # Customer asks which platform this is.
                if re.search(r'(哪的|什么(平台|东西|玩意))', content):
                    sign_ask = True
                ws.append([call_id, customer_id, '', '', '', '', '2_' + content])

    # Summary rows for categories whose flag is still set.
    for key, pending in type_dict.items():
        if pending:
            ws.append([call_id, customer_id, '', '{}错误'.format(key), '', '', ''])

    out_dir = './{}_data_dir/'.format(ip)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    # Read the clock once so the three fields cannot straddle midnight.
    now = datetime.datetime.now()
    date = '{}-{}-{}'.format(now.year, now.month, now.day)

    existing = glob.glob(out_dir + '*.xlsx')
    # basename() instead of split('/') so the prefix test does not depend on
    # the path separator.
    if existing and not os.path.basename(existing[-1]).startswith(date):
        for file_path in existing:
            os.remove(file_path)

    wb.save(out_dir + '{}_{}_{}_{}.xlsx'.format(date, customer_id, sign_num, time))
    return sign_num


def process_queries(current_query):
    """Normalise one transcribed utterance before rule matching.

    Steps, in order:

    1. Remove each filler word, but only when more than two characters
       would remain afterwards.
    2. Collapse immediately repeated substrings (period 1 to 19) to a single
       copy; substrings listed in ``pass_word`` or comparing outside the
       ``\\u4e00``-``\\u9fff`` bounds are kept as exactly two copies.
    3. Where the text before a punctuation mark equals the text after it
       (1 to 4 characters), drop the copy before the mark.
    4. Replace doubled punctuation with a single ',' and strip one leading
       ',' or '。'.

    :param current_query: utterance text.
    :return: the normalised text; an empty input yields an empty string.
    """
    pass_word = ['不需要', '不用了', '不用', '一点', '考虑', '谢', '在', '看', '问', '想', '天', '刚', '试']
    # The original was missing the comma after '拜', which silently merged
    # it with the next entry into '拜嗯,'.
    remove_word = [
        '不好意思', '对不起', '谢谢你', '哎呀', '你好', '您好', '再见', '谢谢', '好吧', '感谢',
        '呃', '啊', '哦', '嘞', '喂', '哎', '哈', '哟', '哇', '呦', '拜',
        '嗯,', '嗯。', '嗯', ',好,', '。好,', ',行,', ',行。', ',对,', ',对。']
    for remove in remove_word:
        stripped = current_query.replace(remove, '')
        if len(stripped) > 2:
            current_query = stripped

    # Remove adjacent repeated words.  Repeat until a pass collapses nothing,
    # because collapsing can expose a new repetition.
    collapsed_any = True
    while collapsed_any:
        collapsed_any = False
        pieces = []
        i = 0
        while i < len(current_query):
            repeat_num = 0
            # Find the shortest period j for which the text at i repeats.
            for j in range(1, 20):
                target = current_query[i:i + j]
                while target == current_query[i + j * (repeat_num + 1):i + j * (repeat_num + 2)]:
                    repeat_num += 1
                if repeat_num:
                    break
            if not repeat_num:
                pieces.append(current_query[i])
                i += 1
                continue
            if target in pass_word or not (u'\u4e00' <= target <= u'\u9fff'):
                # Protected repeat: keep two copies and do not request
                # another pass, otherwise the loop would never terminate.
                pieces.append(target * 2)
            else:
                pieces.append(target)
                collapsed_any = True
            i += j * (repeat_num + 1)
        current_query = ''.join(pieces)

    # Merge identical characters that are separated by a punctuation mark.
    merged = ''
    for i, char in enumerate(current_query):
        if char in [',', '。', '?', '!']:
            for j in range(1, 5):
                if current_query[max(i - j, 0):i] == current_query[i + 1:i + j + 1]:
                    merged = merged[:-1 * j]
                    break
        merged += char
    current_query = merged

    for point_sign in ['?。', '?,', ',。', ',,', '。,', '。。']:
        current_query = current_query.replace(point_sign, ',')
    # Slice instead of index so an empty string does not raise IndexError.
    if current_query[:1] in [',', '。']:
        current_query = current_query[1:]
    return current_query


if __name__ == '__main__':
    pass