# -*- coding: utf-8 -*-
"""Rule-based compliance checks for sales-call transcripts.

Each utterance of a call is matched against the regular-expression tables
below; matches are written, together with the transcript, to an ``.xlsx``
workbook under ``./<ip>_data_dir/``.
"""
import os
import re
import glob
import json
import logging
import datetime

import requests
from openpyxl import Workbook

logger = logging.getLogger(__name__)

# Patterns naming other products/courses. Nothing in this module currently
# consumes this list (the check that used it is disabled).
project_list = [
    re.compile(r'.*(少儿英语|开心鼠|(猿|圆|缘|元)(编程|变成|边城|边上)|英语.{0,10}(体验|启蒙|课程|课)|(网易|有道)|河小象|写字课|叫叫阅读).*'),
    re.compile(r'.*(阅读.{0,10}(兴趣|课程|启蒙|课)).*'),
]

# Special-purpose rules that are looked up individually by key.
re_dict1 = {
    '销售禁行': [r'^((?!app|下载).)*(高途|高图|高徒).*(工作人员|客服)((?!app|下载).)*$'],
    '服务专业': [r'(企微|企业微信|管方微信|服务通知)']
}

# General rule table: error category -> list of regex sources. Every rule is
# tried against every agent utterance.
re_dict2 = {
    '客户冲突': [
        r'^((?!动).)*滚((?!动).)*$',
        r'(垃圾|我操|傻逼|叫声|神经病|有病|卧槽|他妈的|贱人|智商|卑鄙无耻|衣冠禽兽|我丢你老母|屎|装疯卖傻|装傻充愣|疯子|王八|疯狗|乱咬人|听不懂人话|不说人话|猪狗不如|混蛋|没脑子|傻屌|去死吧|TM|人渣|怂|妈逼|操你妈|妈逼|傻子|SB|sb)',
        r'(有人生没人养|不要脸|叫声妈|叫声爹|装纯洁|在东莞挣钱|有脸活|枉为人|噎着|臭|臭嘴|坑蒙拐骗|生下你|镜子|照照|三观不正|吃软饭|脑残|丧样|人话|个泡|装逼|牛逼|婊子|不是人[^工]|贱|废话|丧失|嘴硬|三观不正|出息|没教养|上门女婿|装什么装|没素质|找小三|吃软饭|拿镜子照照|撒泡尿照照|几斤几两|蠢|不着调|屎|脑残|人渣|是不是人|吸毒|溜冰|贩毒|瘾君子|赢钱|赌场|毒品|大波|有种|犯贱|痴呆|败类|丢脸|脑残|人渣|天堂|脸皮厚)',
        r'(我是你.{1,5}(爸|妈|爷|奶))',
        # NOTE(review): `(?!.).` can never match, so this rule only fires when
        # the whole utterance is exactly "妈的" - confirm that is intended.
        r'^((?!.).)*妈的((?!.).)*$',
        r'(东莞.*母亲|东莞.*女儿|母亲.*东莞|女儿.*东莞|你母亲)',
        r'(穷|就缺.{0,5}(钱|报名费)|都不舍得|报名.{0,5}怎么了)'],
    '弄虚作假': [r'^((?!不支持|不能|特价课).)*退费((?!不支持|不能|特价课).)*$',
             r'^((?!不合适).)买.{0,5}(((高|低)一年级)|下学期)((?!不合适).)*$',
             r'帮.{0,5}(亲戚|朋友|同事).{0,5}买|多买点|(亲戚|朋友|同事).{0,5}有需要吗|帮完成个任务'],
    '不当获利': [r'(红包|现金)?.*返利|(发|给).{0,5}(红包|现金)', ],
    '虚假承诺': [
        r'非凡.{0,10}(均|也).{0,10}(能|可)使用',
        r'不限.{0,10}樊登.{0,10}使用',
        r'^((?!(2年|两年|赠送一年|买一年送一年)).)*年卡.{0,50}(有效期|能使用|可用)((?!(2年|两年|赠送一年|买一年送一年)).)*$',
    ],
    # Rule disabled on 2021-10-25 (kept for reference):
    # r'(多|2|两|几)台.{0,10}同时.{0,10}(登陆|登录)|同时.{0,10}(多|2|两|几)台.{0,20}(登陆|登录)'
    # Do not ask the customer for their phone number or WeChat ID.
    '销售禁行': [r'(工资|你的.{0,5}待遇|几口人|身体好吗|吃饭了吗)',
             r'(您|你|我).{0,5}(微信|号码|地址|住址|住的地方).{0,5}(是多少|在哪)', ],
    '服务态度': [],
    '服务专业': [r'(加.{0,5}我.{0,5}[^企业]微信)', ]
}

# Customer-side pattern: the customer asks who/what platform is calling.
re_dict3 = {
    '询问平台': [r'(这个是|哪里|哪个|哪的|怎么办|什么(平台|东西|玩意))', ]
}

# NLU intent service endpoint and the request timeout applied to it.
_NLU_URL = 'http://47.92.230.239:8679/nlu'
_NLU_TIMEOUT_SECONDS = 5

# Repeated tokens in this list are reduced to two copies instead of one.
_KEEP_DOUBLED = ['不需要', '不用了', '不用', '一点', '考虑', '谢', '在', '看', '问', '想', '天', '刚', '试']

# Filler words stripped from an utterance, longest variants first.
_FILLER_WORDS = [
    '不好意思', '对不起', '谢谢你', '哎呀', '你好', '您好', '再见', '谢谢', '好吧', '感谢',
    '呃', '啊', '哦', '嘞', '喂', '哎', '哈', '哟', '哇', '呦', '拜',
    '嗯,', '嗯。', '嗯', ',好,', '。好,', ',行,', ',行。', ',对,', ',对。']


def _compile_rules(rule_table):
    """Return ``(category, source, compiled)`` triples for every rule, in table order."""
    return [(category, source, re.compile(source))
            for category, sources in rule_table.items()
            for source in sources]


def _row(call_id, customer_id, error_type='', rule='', matched='', text=''):
    """Build one 7-column worksheet row; column 3 carries the error type."""
    return [call_id, customer_id, '', error_type, rule, matched, text]


def _data_dir(ip):
    """Return the output directory for *ip*, creating it if it does not exist."""
    path = './{}_data_dir/'.format(ip)
    os.makedirs(path, exist_ok=True)
    return path


def recognition(conversation, repeat_time, call_id, customer_id, who, call_date, wav_path, ip, time, channel,
                phone, stop_sign):
    """Check one call transcript against the rule tables and save it as a workbook.

    Args:
        conversation: sequence of utterances. With ``channel == 1`` each item
            is ``(content, type)``; with ``channel == 2`` each item is
            ``(content, type, start_time, end_time)`` and ``type == 1`` marks
            an agent utterance.
        repeat_time: per-segment repeat durations, forwarded to
            ``all_recognition``.
        call_id, customer_id, who, call_date, wav_path, phone: written to the
            header row of the workbook.
        ip: used to build the output directory ``./<ip>_data_dir/``.
        time: forwarded to ``all_recognition`` and used in the output file
            name (presumably the call duration in seconds - TODO confirm).
        channel: ``1`` for a single mixed channel, ``2`` for separated
            agent/customer channels. Any other value writes only the header.
        stop_sign: unused.

    Returns:
        ``1`` if any saved row carries an error type, otherwise ``0``. A
        dual-channel call containing a voicemail / line-busy prompt returns
        ``0`` immediately.
    """
    wb = Workbook()
    ws = wb.active
    ws.append([call_id, customer_id, who, call_date, wav_path, phone])

    sign_guanfangweixin = 0  # set once the official/enterprise WeChat is mentioned
    sign_jiaweixin = 0       # never set in this module, so its branch below is inactive
    sign_ask = False         # customer has asked which platform is calling
    sign_tousu = False       # complaint detected; recorded but not consumed yet
    dialogue_list = list()
    type_dict = {'客户冲突': 0, '弄虚作假': 0, '不当获利': 0, '虚假承诺': 0, '销售禁行': 0, '服务态度': 0, '服务专业': 0,
                 '询问平台': 0}

    # Resolve all patterns once per call instead of once per utterance.
    rules = _compile_rules(re_dict2)
    staff_claim_rule = re_dict1['销售禁行'][0]
    staff_claim_re = re.compile(staff_claim_rule)
    official_wechat_re = re.compile(re_dict1['服务专业'][0])
    complaint_re = re.compile(r'(投诉|领导在哪)')

    if channel == 1:
        ask_platform_re = re.compile(re_dict3['询问平台'][0])
        for utterance in conversation:
            content, _speaker = utterance
            try:
                content = process_queries(content)
            except Exception:
                # Best effort: fall back to the raw text when clean-up fails.
                logger.warning('process_queries failed for call %s', call_id, exc_info=True)

            # Customer speech: asks which platform is calling.
            if ask_platform_re.search(content):
                sign_ask = True
                dialogue_list.append(_row(call_id, customer_id, text='1_' + content))
                continue

            # Customer speech: complaint intent.
            if complaint_re.search(content):
                sign_tousu = True
                dialogue_list.append(_row(call_id, customer_id, text='1_' + content))
                continue

            # Agent speech.
            flagged = False
            hit = staff_claim_re.search(content)
            if hit and not sign_ask:
                dialogue_list.append(_row(call_id, customer_id, '{}错误'.format('销售禁行'),
                                          staff_claim_rule, hit.group(), '1_' + content))
                sign_guanfangweixin = 1
                flagged = True

            for false_type, re_rule, pattern in rules:
                # The personal-WeChat rule is not reported on a single channel.
                if false_type == '服务专业':
                    continue
                hit = pattern.search(content)
                if hit:
                    dialogue_list.append(_row(call_id, customer_id, '{}错误'.format(false_type),
                                              re_rule, hit.group(), '1_' + content))
                    flagged = True

            if not flagged:
                dialogue_list.append(_row(call_id, customer_id, text='1_' + content))

    elif channel == 2:
        conversation_1 = list()  # agent utterances
        conversation_2 = list()  # customer utterances
        for turn in conversation:
            if turn[1] == 1:
                conversation_1.append(turn)
                continue
            if any(word in turn[0] for word in ('留言', '正在通话中')):
                # Voicemail / line-busy prompt: save the workbook and stop.
                conver_list = [call_id, customer_id, who, call_date, '0', '']
                conver_list.extend(str(t[1]) + '_' + t[0] for t in conversation)
                # NOTE(review): this row is collected but never written to the
                # sheet, so the saved file only holds the header - confirm.
                dialogue_list.append(conver_list)
                wb.save('{}{}.xlsx'.format(_data_dir(ip), call_id))
                return 0
            conversation_2.append(turn)

        # The final flag is derived from the collected rows further down, so
        # the value returned by all_recognition is not needed here.
        all_recognition('开场', call_id, customer_id, conversation_1, conversation_2, repeat_time,
                        dialogue_list, type_dict, 0, time)

        customer_ask_re = re.compile(r'(哪的|什么(平台|东西|玩意))')
        for turn in conversation:
            content, speaker, _start_time, _end_time = turn
            content = process_queries(content)

            if speaker == 1:
                flagged = False

                # Regular rule matching.
                for false_type, re_rule, pattern in rules:
                    hit = pattern.search(content)
                    if hit:
                        dialogue_list.append(_row(call_id, customer_id, '{}错误'.format(false_type),
                                                  re_rule, hit.group(), '1_' + content))
                        type_dict[false_type] = 1
                        flagged = True

                # Staff/customer-service claim made before the customer asked
                # which organisation is calling.
                hit = staff_claim_re.search(content)
                if hit and not sign_ask:
                    dialogue_list.append(_row(call_id, customer_id, '{}错误'.format('销售禁行'),
                                              staff_claim_rule, hit.group(), '1_' + content))
                    flagged = True

                # Did the agent refer to the official WeChat account?
                if official_wechat_re.search(content):
                    sign_guanfangweixin = 1

                if not flagged:
                    dialogue_list.append(_row(call_id, customer_id, text='1_' + content))
            else:
                # Ask the online intent model about the customer utterance.
                intent = intent_judge(content, customer_id, ip, 'nlu获取意图错误')
                if intent == '什么平台':
                    sign_ask = True
                elif intent == '投诉':
                    sign_tousu = True
                if sign_jiaweixin and intent and intent.startswith('不需要'):
                    sign_jiaweixin = 0
                    sign_guanfangweixin = 1

                # Customer asks who is calling.
                if customer_ask_re.search(content):
                    sign_ask = True
                # Complaint intent.
                if complaint_re.search(content):
                    sign_tousu = True
                dialogue_list.append(_row(call_id, customer_id, text='2_' + content))

        all_recognition('结束', call_id, customer_id, conversation_1, conversation_2, repeat_time,
                        dialogue_list, type_dict, 0, time)

        # Summary rows: one per error category that fired.
        for key in type_dict:
            if type_dict[key]:
                dialogue_list.append(_row(call_id, customer_id, '{}错误'.format(key)))

    # A "add my WeChat" hit is not an error when the agent used the official
    # (enterprise) WeChat account: blank those rows instead of flagging them.
    sign_num = 0
    for row in dialogue_list:
        if row[3] == '服务专业错误' and sign_guanfangweixin:
            row[3:7] = ['', '', '', '']
        elif row[3]:
            sign_num = 1
        ws.append(row)

    data_dir = _data_dir(ip)
    # Read the clock once so the date cannot straddle midnight.
    now = datetime.datetime.now()
    date = '{}-{}-{}'.format(now.year, now.month, now.day)
    # When the last listed workbook is not from today, clear the directory.
    existing = glob.glob(data_dir + '*.xlsx')
    if existing and not os.path.basename(existing[-1]).startswith(date):
        for file_path in existing:
            os.remove(file_path)
    wb.save(data_dir + '{}_{}_{}_{}.xlsx'.format(date, customer_id, sign_num, time))
    return sign_num


def all_recognition(type, call_id, customer_id, conversation_1, conversation_2, repeat_time, finnal_list,
                    type_dict, sign_num, time):
    """Append segment-level findings (opening / closing of the call) to *finnal_list*.

    Args:
        type: ``'开场'`` (opening) or ``'结束'`` (closing); other values do nothing.
        call_id, customer_id: copied into every appended row.
        conversation_1: agent utterances as ``(content, type, start_time,
            end_time)`` tuples.
        conversation_2: customer utterances (unused).
        repeat_time: iterable of repeat durations, one per segment.
        finnal_list: list of rows, mutated in place.
        type_dict: unused.
        sign_num: returned unchanged.
        time: value the last agent ``end_time`` is subtracted from
            (presumably the total call duration - TODO confirm).

    Returns:
        *sign_num*, unchanged.
    """
    if type == '开场':
        # Opening idle check: the first agent utterance starts after 5 seconds.
        if conversation_1 and conversation_1[0][2] > 5:
            finnal_list.append([call_id, customer_id, '', '销售禁行', '开场空挂时长超过5秒',
                                conversation_1[0][2], conversation_1[0][3]])
    elif type == '结束':
        # Closing idle check: more than 5 seconds after the last agent utterance.
        if conversation_1 and (time - conversation_1[-1][3]) > 5:
            finnal_list.append([call_id, customer_id, '', '销售禁行', '结束空挂时长超过5秒', '', ''])
        # NOTE(review): the original indentation was lost; the repeat check is
        # placed in the closing branch so each segment is reported once - confirm.
        for i, re_time in enumerate(repeat_time):
            if re_time > 5:
                # Seven columns, matching every other row in the sheet.
                finnal_list.append([call_id, customer_id, '', '第{}段交互重复时长超过5秒'.format(i), '', '', ''])
    return sign_num


def intent_judge(query, customer_id, ip, text):
    """Return the standard intent the NLU service reports for *query*.

    Args:
        query: utterance text sent as ``current_query``.
        customer_id, ip: only used in the warning log on failure.
        text: message logged when the lookup fails.

    Returns:
        ``intention[0]['value']['standard_query']`` from the JSON response, or
        ``''`` when no intent is reported or the request/parsing fails.
    """
    # Pass the query through ``params`` so it is URL-escaped and the
    # ``current_query`` parameter is spelled correctly.
    params = {'session_id': '-1', 'workspace': '222', 'current_query': str(query)}
    try:
        response = requests.get(_NLU_URL, params=params, timeout=_NLU_TIMEOUT_SECONDS)
        result = json.loads(response.text)
        intention = result['intention']
        if intention and intention[0].get('value', ''):
            return intention[0]['value']['standard_query']
    except Exception:
        # Best effort: a failed lookup must not abort the transcript scan.
        # The WeChat alert hook is disabled:
        # send_wechat_warning(customer_id, ip, text)
        logger.warning('%s (customer_id=%s, ip=%s)', text, customer_id, ip, exc_info=True)
    return ''


def process_queries(current_query):
    """Normalise an utterance: drop filler words and collapse stuttered repeats.

    Steps, in order:
      1. Remove each filler word, but only while more than two characters
         would remain.
      2. Collapse immediately repeated substrings (period 1-19) to a single
         copy; tokens in the keep-doubled list and non-CJK tokens are reduced
         to two copies instead. Repeats until nothing changes.
      3. Merge text that is repeated on both sides of a punctuation mark.
      4. Normalise doubled punctuation to ``','`` and strip one leading
         ``','`` or ``'。'``.

    Args:
        current_query: utterance text.

    Returns:
        The cleaned text; an empty input yields an empty string.
    """
    for filler in _FILLER_WORDS:
        stripped = current_query.replace(filler, '')
        if len(stripped) > 2:
            current_query = stripped

    # Remove adjacent identical words.
    changed = True
    while changed:
        changed = False
        pieces = []
        i = 0
        while i < len(current_query):
            repeat_num = 0
            for j in range(1, 20):
                target = current_query[i:i + j]
                # Count how many times `target` repeats right after itself.
                while target == current_query[i + j * (repeat_num + 1):i + j * (repeat_num + 2)]:
                    repeat_num += 1
                if repeat_num:
                    break
            if not repeat_num:
                pieces.append(current_query[i])
                i += 1
                continue
            # The range test is lexicographic, i.e. it is driven by the first
            # character of `target` (CJK unified ideographs U+4E00..U+9FFF).
            if target in _KEEP_DOUBLED or not (u'\u4e00' <= target <= u'\u9fff'):
                pieces.append(target * 2)
            else:
                pieces.append(target)
                # Only a real collapse warrants another pass; otherwise a
                # kept-doubled token would loop forever.
                changed = True
            i += j * (repeat_num + 1)
        current_query = ''.join(pieces)

    # Merge identical text separated by a punctuation mark.
    current_query_result = ''
    for i in range(len(current_query)):
        if current_query[i] in [',', '。', '?', '!']:
            for j in range(1, 5):
                if current_query[i - j if i - j >= 0 else 0:i] == current_query[i + 1:i + j + 1]:
                    current_query_result = current_query_result[:-1 * j]
                    break
        current_query_result += current_query[i]
    current_query = current_query_result

    for point_sign in ['?。', '?,', ',。', ',,', '。,', '。。']:
        current_query = current_query.replace(point_sign, ',')
    # startswith() is safe on an empty string, unlike indexing [0].
    if current_query.startswith((',', '。')):
        current_query = current_query[1:]
    return current_query


if __name__ == '__main__':
    pass