# -*- coding: utf-8 -*-
"""Offline evaluation helpers for the NLU / FAQ services.

The module cleans up transcribed utterances, sends them (optionally in
sliding windows of clauses) to the NLU HTTP service, and records the
recognised intents in Excel workbooks.
"""
import json
import time

import pymysql
import requests
import tqdm
from openpyxl import Workbook, load_workbook

# Service endpoints.  Alternative deployments seen in earlier revisions of
# this script: NLU on port 8670, FAQ on port 8454.
NLU_URL = 'http://8.142.85.77:8680/nlu'
FAQ_URL = 'http://8.142.85.77:8456/level_search'

# Words whose immediate repetition is kept (as exactly two copies) instead
# of being collapsed to one copy by ``process_queries``.
# NOTE(review): earlier revisions also had '不需要了', '不需要', '不用了',
# '不用', '不要', '不要了' here, commented out.  The exact end of that
# commented-out run could not be recovered from the collapsed source --
# confirm against version control.
_PASS_WORDS = ('一点', '考虑', '谢', '在', '看', '问', '想', '天', '刚', '试', '拜')

# Single-character acknowledgements dropped at the very start / end of an
# utterance when they are separated from the rest by a comma.
_ACK_CHARS = ('好', '行', '对')


def nlu_result(workspace, query):
    """Query the NLU service and normalise its answer into a flat row.

    Args:
        workspace: workspace id sent to the service (a string such as '246').
        query: the utterance to classify.

    Returns:
        A list ``[q_type, intent, query, answer, update_time]``.

        * When the service returns an intention with a value, the fields are
          taken from that value (``qtype``, ``standard_query``,
          ``original_query``, ``a``).
        * For workspace ``'210'`` only: when there is no intention but slots
          are present, ``intent`` is the space-joined first slot values.
        * Otherwise the FAQ service is consulted and ``intent`` becomes
          ``'<standard_query>_<original_query>'`` with answer ``'NOINTENT'``.
          Callers detect this case by the ``'_'`` in ``intent``.
    """
    update_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    # ``params`` lets requests build and percent-encode the query string.
    # The previous hand-formatted URL had its ``&current_query`` parameter
    # corrupted into ``¤t_query`` by an HTML-entity mix-up.
    response = requests.get(
        NLU_URL,
        params={'session_id': '-1', 'workspace': workspace, 'current_query': query},
    )
    response = json.loads(response.text)

    intentions = response.get('intention') or []
    value = intentions[0].get('value') if intentions else None
    if value:
        return [value['qtype'], value['standard_query'],
                value['original_query'], value['a'], update_time]

    # Slot-only answers are reported for workspace '210' only, and only when
    # the service returned no intention at all.
    if workspace == '210' and not intentions and response.get('slot'):
        slot_values = [item['slot_value'][0] for item in response['slot']]
        return ['默认类型', ' '.join(slot_values), query, '', update_time]

    # Fallback for every remaining case.  This also covers workspace '210'
    # with neither intention nor slot, which used to fall through and
    # return None.
    faq_query, faq_intent, _ = faq_result(workspace, query)
    return ['默认分类', '{}_{}'.format(faq_intent, faq_query), query,
            'NOINTENT', update_time]


def faq_result(workspace, query):
    """Return the best FAQ match for *query*.

    The ``ch`` hit list of the response is preferred over the ``h`` list;
    the first entry of whichever list is non-empty is used.

    Returns:
        A tuple ``(original_query, standard_query, semantic)``, or
        ``('', '', '')`` when neither list contains a hit.
    """
    response = requests.get(FAQ_URL, params={'systemId': workspace, 'query': query})
    response = json.loads(response.text)
    for level in ('ch', 'h'):
        hits = response.get(level)
        if hits:
            best = hits[0]
            return best['original_query'], best['standard_query'], best['semantic']
    return '', '', ''


def _find_repeat(text, pos, max_width=6):
    """Find the shortest unit at *pos* that is immediately repeated.

    Returns ``(unit, repeats)`` where ``repeats`` is the number of extra
    consecutive copies following the first one, or ``('', 0)`` if no unit of
    width 1..max_width repeats.
    """
    for width in range(1, max_width + 1):
        unit = text[pos:pos + width]
        repeats = 0
        while unit == text[pos + width * (repeats + 1):pos + width * (repeats + 2)]:
            repeats += 1
        if repeats:
            return unit, repeats
    return '', 0


def _collapse_repeats(text):
    """Collapse immediately repeated units until the text stops changing."""
    changed = True
    while changed:
        changed = False
        pieces = []
        pos = 0
        while pos < len(text):
            unit, repeats = _find_repeat(text, pos)
            if not repeats:
                pieces.append(text[pos])
                pos += 1
                continue
            # Pass words and units outside the CJK range keep two copies.
            # The range test is a plain string comparison, so for longer
            # units it is essentially decided by the first character.
            if unit in _PASS_WORDS or not ('\u4e00' <= unit <= '\u9fff'):
                pieces.append(unit * 2)
            else:
                pieces.append(unit)
                # Only a real collapse schedules another pass; otherwise a
                # kept pair would be re-detected forever.
                changed = True
            pos += len(unit) * (repeats + 1)
        text = ''.join(pieces)
    return text


def _drop_edge_acknowledgements(text):
    """Drop a leading '好,' / trailing ',好' style acknowledgement character.

    The result is only used when more than two characters remain;
    otherwise *text* is returned unchanged.
    """
    last = len(text) - 1
    kept = []
    for idx, char in enumerate(text):
        if char in _ACK_CHARS:
            if idx == 0:
                if last > 0 and text[1] == ',':
                    continue
            elif idx == last and text[idx - 1] == ',':
                continue
        kept.append(char)
    return ''.join(kept) if len(kept) > 2 else text


def _merge_across_punctuation(text):
    """Remove a fragment that is repeated right after a punctuation mark.

    For each punctuation character, if the 1..4 characters before it equal
    the same number of characters after it, the copy before the mark is
    removed from the output.
    """
    merged = ''
    for idx, char in enumerate(text):
        if char in (',', '。', '?', '!'):
            for width in range(1, 5):
                if text[max(idx - width, 0):idx] == text[idx + 1:idx + width + 1]:
                    merged = merged[:-width]
                    break
        merged += char
    return merged


def process_queries(current_query):
    """Clean a transcribed utterance of stutter-like repetitions.

    Steps, in order: collapse immediately repeated units, drop edge
    acknowledgement characters, merge fragments repeated across a
    punctuation mark, normalise doubled punctuation to a single comma and
    strip one leading comma / full stop.

    Args:
        current_query: the utterance text (may be empty).

    Returns:
        The cleaned text.
    """
    current_query = _collapse_repeats(current_query)
    current_query = _drop_edge_acknowledgements(current_query)
    current_query = _merge_across_punctuation(current_query)

    # NOTE(review): ',,' is listed twice -- possibly one entry was a
    # full-width variant lost in transcoding; confirm before removing.
    for point_sign in ['?。', '?,', ',。', ',,', '。,', '。。', ',,']:
        current_query = current_query.replace(point_sign, ',')

    # Slicing (rather than indexing) keeps the empty string from raising.
    if current_query[:1] in (',', '。'):
        current_query = current_query[1:]
    return current_query


def rm_stop_word(query):
    """Remove filler words from *query* without ever emptying it.

    A stop word is removed only if something other than punctuation would
    remain afterwards.  A leading acknowledgement such as '好,' is removed
    only if at least two non-punctuation characters follow it.  The
    sentinel ``"@@quiet@@"`` is returned untouched.
    """
    # NOTE(review): '再见' and '噢' appear twice; kept as found.
    stop_word = ['quiet', '@', '不好意思', '对不起', '好谢谢', '谢谢您', '谢谢你',
                 '拜拜', '谢谢', '好吧', '好嘞', '你好', '您好', '然后', '抱歉',
                 '再见', '再见', '受累', '姐夫', '嫂子', '大哥', '老妹', '兄弟',
                 '美女', '谢', '哦', '啊', '嘞', '喂', '啦', '唉', '哎', '哥', '姐',
                 '哈', '呐', '呃', '噢', '诶', '噢', '唔', '呢', '呀',
                 '嗯,', '嗯。', '嗯',
                 ',好,', '。好,', '。好。', ',行,', ',行。', '。行。',
                 ',对,', ',对。', '。对。',
                 ]
    if query == "@@quiet@@":
        return query

    for stop in stop_word:
        stripped = query.replace(stop, '')
        # NOTE(review): the comma is stripped twice -- possibly one of the
        # two was a full-width comma lost in transcoding; kept as found.
        remainder = (stripped.replace(',', '').replace(',', '')
                     .replace('。', '').replace('?', ''))
        if len(remainder) > 0:
            query = stripped

    for start_word in ['好,', '好。', '行,', '行。', '对,', '对。']:
        if not query.startswith(start_word):
            continue
        tail = query[2:]
        if len(tail.replace(',', '').replace('。', '').replace('?', '')) >= 2:
            query = tail
    return query


def nlu_task(content, num, ws_w, workspace='246'):
    """Classify *content* clause-window by clause-window and log the results.

    The text is cleaned, split into clauses on ',' / '。', clauses following
    a clause shorter than four characters are glued onto it, and every
    window of ``num`` consecutive clauses is sent to ``nlu_result``.  If
    there are at most ``num`` clauses, the whole text is sent once.

    Args:
        content: raw utterance text.
        num: number of clauses per window.
        ws_w: worksheet-like object; one row
            ``[q_type, intent-or-'NOINTENT', window, query, answer]`` is
            appended per window.
        workspace: NLU workspace id (defaults to the previously hard-coded
            ``'246'``).

    Returns:
        The set of intents recognised in any window.
    """
    try:
        content = rm_stop_word(content)
        content = process_queries(content)
        content = rm_stop_word(content)
    except Exception as exc:
        # Cleaning is best-effort: continue with whatever was cleaned so
        # far, but do not hide the failure.
        print('query cleaning failed, continuing with partial result: {!r}'.format(exc))

    clauses = []
    for idx, piece in enumerate(content.replace('。', ',').split(',')):
        if idx != 0 and len(clauses[-1]) < 4:
            clauses[-1] = clauses[-1] + piece
        else:
            clauses.append(piece)

    intent_set = set()
    # With at most ``num`` clauses there is exactly one window: everything.
    window_count = max(len(clauses) - num + 1, 1)
    for start in range(window_count):
        window = ','.join(clauses[start:start + num])
        print(window)
        result = nlu_result(workspace, window)
        print(result)
        # An underscore marks the FAQ fallback label built by nlu_result.
        if '_' not in result[1]:
            print(result[1])
            ws_w.append([result[0], result[1], window, result[2], result[3]])
            intent_set.add(result[1])
        else:
            ws_w.append([result[0], 'NOINTENT', window, result[2], result[3]])
    print(intent_set)
    return intent_set


def task():
    """Measure how often the labelled intent is recognised by the NLU.

    Reads the third and fourth sheets of the labelled workbook, runs
    ``nlu_task`` on rows whose second column is '坐席' and whose label is in
    the target list, and writes per-window details plus a per-intent
    summary sheet to './金融准确率.xlsx'.
    """
    target_intents = ['操作流程:银行卡信息', ]
    # Intents in the same group count as a hit for one another.
    equivalent_groups = (
        {'是否打开APP', '是否同意在线协助操作'},
        {'产品介绍', '是否同意注册查看额度'},
    )

    def is_hit(expected, predicted):
        for group in equivalent_groups:
            if expected in group:
                return bool(group & predicted)
        return expected in predicted

    wb = load_workbook('./还呗人人对话3.24.xlsx')
    wb_w = Workbook()
    ws_detail = wb_w.active
    intent_dict = dict()  # intent -> [occurrences, hits]

    for sheet_index in [2, 3]:
        ws = wb[wb.sheetnames[sheet_index]]
        for row_index, row in tqdm.tqdm(enumerate(ws.values)):
            if row_index == 0 or row[1] != '坐席':
                continue
            query = row[2]
            # The fifth column, when filled, overrides the fourth.
            intent = row[4] if row[4] else row[3]
            if intent not in target_intents:
                continue
            print(query)
            counts = intent_dict.setdefault(intent, [0, 0])
            # nlu_task requires a worksheet for its per-window rows; the
            # call previously omitted it and raised TypeError.
            intent_list = nlu_task(query, 5, ws_detail)
            if is_hit(intent, intent_list):
                counts[1] += 1
            counts[0] += 1

    all_num = 0
    all_true = 0
    # The summary must live in the workbook that is saved (wb_w), not in
    # the input workbook.
    ws_w = wb_w.create_sheet('出现标签准确率')
    ws_w.append(['意图名称', '是否出现/正确率', '出现次数', '正确数'])
    for keys, (total, correct) in intent_dict.items():
        all_num += total
        all_true += correct
        rate = (correct / total) * 100 if total else 0.0
        ws_w.append([keys, rate, total, correct])
        print('{},{}%,出现总数:{},正确数:{}'.format(keys, rate, total, correct))

    # Guard against an empty evaluation set.
    overall = all_true / all_num * 100 if all_num else 0.0
    print('整体准确率:{}'.format(overall))
    ws_w.append(['整体准确率', overall, all_num, all_true])
    wb_w.save('./金融准确率.xlsx')


def get_data_from_datebase():
    """Export human dialogue rows with recognised intents to Excel.

    Fetches dialogue rows for company 2141 in a fixed date range, runs
    ``nlu_result`` (workspace '304') on rows whose ``identity`` is 1, and
    writes everything to './轻舟纯人数据.xlsx', saving every 2000 rows.
    """
    # NOTE(security): root credentials are hard-coded in source.  They
    # should be moved to configuration / environment and rotated.
    connection = pymysql.connect(host="47.92.193.147", port=3306, user="root",
                                 passwd="Moxi123#", db="task_dialogue_config",
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    sql_id = ('select call_id, content_text, identity, workspace_id, '
              'company_id, call_start_time '
              'from human_dialogue '
              'where company_id = %s '
              'and call_start_time > %s '
              'and call_start_time < %s')
    try:
        with connection.cursor() as cursor:
            # Values are bound by the driver instead of being formatted
            # into the SQL text.
            cursor.execute(sql_id, (2141, '2022-03-15 09:00:00', '2022-03-18 23:59:59'))
            results = cursor.fetchall()
    finally:
        # Everything is fetched up front, so the connection can be released
        # before the slow NLU loop.
        connection.close()
    print(len(results))

    wb = Workbook()
    ws = wb.active
    # NOTE(review): the header names three columns while each data row
    # carries five (intent and matched query follow) -- confirm intent.
    ws.append(['session_id', '角色', '内容'])

    for num, result in enumerate(tqdm.tqdm(results), start=1):
        call_id = result['call_id']
        content = result['content_text']
        identity = result['identity']
        intent = ''
        intent_query = ''
        if identity == 1:
            res = nlu_result('304', content)
            if '_' not in res[1]:
                intent = res[1]
                intent_query = res[2]
        ws.append([call_id, identity, content, intent, intent_query])
        if num % 2000 == 0:
            # Periodic checkpoint so a crash does not lose all progress.
            wb.save('./轻舟纯人数据.xlsx')
    wb.save('./轻舟纯人数据.xlsx')


def total_dialogue_test():
    """Run ``nlu_task`` over the quality-inspection workbook.

    Processes every non-header row of the first sheet whose second column
    is non-empty and whose third column equals 1, and saves the per-window
    results to './质检标注.xlsx'.
    """
    wb_w = Workbook()
    ws_w = wb_w.active
    wb = load_workbook('./2022-5-6_金条质检纯人违规.xlsx')
    ws = wb[wb.sheetnames[0]]
    for row_index, row in tqdm.tqdm(enumerate(ws.values)):
        if row_index == 0 or not row[1] or row[2] != 1:
            continue
        print(row)
        customer_id, content, intent = row[0], row[1], row[2]
        nlu_task(content, 3, ws_w)
        print(customer_id, intent)
    wb_w.save('./质检标注.xlsx')


if __name__ == '__main__':
    wb = Workbook()
    ws = wb.active
    content = '嗯,先生咱们这个首页激活呢,是只需要。嗯几块钱的线生也是非常划算的,然后六百万的保障先上,然后先打开网址看一下你这个保单好吧先生。'
    intent_list = nlu_task(content, 3, ws)
    # Other entry points: total_dialogue_test(), task(),
    # get_data_from_datebase()