# -*- coding: utf-8 -*-
"""Build training and test corpora for an outbound-call dialogue bot.

The module reads a "bot document" workbook (flow sheet + FAQ sheet), turns the
flow into ``treelib`` trees, and derives from them

* response-selection training samples (``bot_document_to_dialogue``),
* end-to-end test dialogues (``bot_document_to_tree``),
* intent-labelling samples (``bot_document_to_nlu``),

plus a few ad-hoc utilities that pull call transcripts from MySQL / HTTP
services and normalise user utterances.

NOTE(review): this file was recovered from a whitespace-mangled copy.  Block
nesting was rebuilt from syntax and intent; places where the nesting was
ambiguous are marked below and should be confirmed against a pristine copy.
"""
import copy
import io
import itertools
import json
import os
import random
import re
import uuid
from concurrent.futures import ThreadPoolExecutor

try:
    import tqdm
    import openai
    import pymysql
    import requests
    import threadpool
    # from .util_tool import utils
    from treelib import Tree
    from openpyxl import load_workbook, Workbook
except ImportError as _import_error:
    # The third-party stack is only needed by the spreadsheet / database / HTTP
    # entry points.  Remember the failure instead of crashing at import time so
    # the pure helpers (jdump, jload, process_queries, path_to_dialogue_data)
    # stay usable; _require_third_party() re-raises it where it matters.
    _THIRD_PARTY_IMPORT_ERROR = _import_error
else:
    _THIRD_PARTY_IMPORT_ERROR = None

# NOTE: `copy`, `threadpool` and `ThreadPoolExecutor` are not referenced by the
# code below; the imports are retained because other parts of the project may
# rely on them being importable from here.

# NOTE(review): the clauses of the two prompt templates are separated by single
# spaces in the recovered source; the original may have used line breaks.  The
# values are reproduced exactly as recovered.
dialogue_prompt = (
    '任务的要求为: {} '
    '任务的运转逻辑是: {} '
    '任务的可选择话术为: {} '
    '你需要记住以下几点: '
    '1.当任务的可选择话术为空时,判断任务已完成。 '
    '2.当任务的可选择话术不为空,根据对话记录,任务的要求判断任务是否完成。 '
    '3.当判断任务已完成,输出"任务完成。"。 '
    '4.当判断任务未完成,必须只能在可选择话术中选择一句话,选择的逻辑参考"任务的运转逻辑"和"任务的要求"。'
    '输出可选择话术中选择的那句话。 '
    '输入:'
)

nlu_prompt = (
    '任务的要求为: {} '
    '任务的运转逻辑是: {} '
    '你需要记住以下几点: '
    '1.根据输入的对话记录,分析最后一次用户的表述的含义,给用户表述的含义打一个标签并给出理由。 '
    '输入:'
)

# Labels that stand for "no concrete intent"; see _resolve_label().
_ATTITUDE_LABELS = ('NOINTENT', '肯定态度', '否定态度', '无态度')

# Intent names that FAQ_file() folds into a coarser label.
_INTENT_RENAMES = {
    '在操作': '正在操作',
    '在忙-无原因': '没时间',
    '在忙-有原因': '没时间',
    '在忙-快点说': '快点说',
    '在忙-主动邀约': '主动邀约',
}


def _require_third_party():
    """Re-raise the deferred ImportError when a third-party package is missing."""
    if _THIRD_PARTY_IMPORT_ERROR is not None:
        raise _THIRD_PARTY_IMPORT_ERROR


def generate_uuid():
    """Return a random UUID4 as a string."""
    return str(uuid.uuid4())


def _text_encoding(mode: str):
    """Return the encoding to open a file with: UTF-8 for text, None for binary."""
    return None if 'b' in mode else 'utf-8'


def _make_w_io_base(f, mode: str):
    """Return a writable file object for *f*.

    A path is opened with *mode* (its parent directory is created when needed);
    an already open ``io.IOBase`` object is returned unchanged.
    """
    if not isinstance(f, io.IOBase):
        f_dirname = os.path.dirname(f)
        if f_dirname != "":
            os.makedirs(f_dirname, exist_ok=True)
        # Explicit UTF-8: jdump writes non-ASCII text (ensure_ascii=False), which
        # must not depend on the platform default encoding.
        f = open(f, mode=mode, encoding=_text_encoding(mode))
    return f


def _make_r_io_base(f, mode: str):
    """Return a readable file object for *f* (path or already open file)."""
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode, encoding=_text_encoding(mode))
    return f


def jdump(obj, f, mode="w", indent=4, default=str):
    """Dump a str or dictionary to a file in json format.

    Args:
        obj: An object to be written (dict, list or str).
        f: A string path to the location on disk, or an open file object.
        mode: Mode for opening the file.
        indent: Indent for storing json dictionaries.
        default: A function to handle non-serializable entries; defaults to `str`.

    Raises:
        ValueError: if *obj* is not a dict, list or str.  The check happens
            before the target is opened so an existing file is not truncated.
    """
    if not isinstance(obj, (dict, list, str)):
        raise ValueError(f"Unexpected type: {type(obj)}")
    # The file object is closed on exit, including one passed in by the caller
    # (same as before), and now also when writing fails.
    with _make_w_io_base(f, mode) as handle:
        if isinstance(obj, str):
            handle.write(obj)
        else:
            json.dump(obj, handle, indent=indent, default=default, ensure_ascii=False)


def jload(f, mode="r"):
    """Load a .json file into a dictionary.

    *f* may be a path or an open file object; it is closed before returning.
    """
    with _make_r_io_base(f, mode) as handle:
        return json.load(handle)


def _bracket_label(cell):
    """Wrap a non-empty cell value in square brackets; empty cells give ''."""
    return '[' + cell + ']' if cell else ''


def _strip_brackets(marker):
    """Remove the 【】 brackets from a task-transfer marker."""
    return marker.replace('【', '').replace('】', '')


def _slot_text(slot_id, label, wav_no, response):
    """Format one full seat response together with its slot id and recording number."""
    return '《槽位id:{}》'.format(slot_id) + label + '@#' + wav_no + '||' + response + '#@'


def _expand_stage_ids(task_stage):
    """Expand a stage cell containing x / y / z placeholders into node ids.

    Such a cell holds the id template on its first line and one value list per
    following line, read as ``x=...`` (line 2), ``y=...`` (line 3) and
    ``z=...`` (line 4), with values separated by commas.  ``y`` is honoured only
    together with ``x`` and ``z`` only together with both.

    Returns:
        The expanded ids (x varies slowest), or None when the cell contains no
        ``x`` and therefore is a plain node id.
    """
    if 'x' not in task_stage:
        return None
    names = ['x']
    if 'y' in task_stage:
        names.append('y')
        if 'z' in task_stage:
            names.append('z')
    lines = task_stage.split('\n')
    value_lists = [
        lines[index + 1].split(name + '=')[-1].split(',')
        for index, name in enumerate(names)
    ]
    stage_ids = []
    for values in itertools.product(*value_lists):
        stage_id = lines[0]
        for name, value in zip(names, values):
            stage_id = stage_id.replace(name, value)
        stage_ids.append(stage_id)
    return stage_ids


def _parent_id(stage_id):
    """Return the id of the parent node: everything before the last dot."""
    return '.'.join(stage_id.split('.')[:-1])


# Training / test data generation for response selection.
def bot_document_to_dialogue(file_path):
    """Generate response-selection training data from the bot document.

    Reads the first sheet of *file_path* row by row, collects per task the
    entry conditions, the selectable (simplified) responses and a tree of the
    dialogue flow, then converts sliding windows over every root-to-leaf path
    into samples via ``path_to_dialogue_data``.

    Output: ``./v2_train_data_360UJD首贷.json`` (shuffled samples) and
    ``./v2_nlu_train_data_360UJD首贷.json``.

    Raises:
        KeyError: if the document has no task named '收集资金用途'.
    """
    _require_third_party()
    wb = load_workbook(file_path)
    ws = wb[wb.sheetnames[0]]
    _, FAQ_name_list, _ = bot_document_to_faq(file_path)
    task_transfer_pattern = re.compile(r'【.*】')
    # "<task>+<condition>" -> name of the task the flow jumps to.  Filled for
    # reference only; nothing below reads it.
    task_transfer = dict()
    task_dict = dict()
    task_name = ''
    slot_id = 1
    for row in ws.values:
        # The task name is only written on the first row of a task.
        task_name = row[0] if row[0] else task_name
        if task_name == '流程文档结束':
            break
        task_finish_logic = row[1]
        task_run_logic = row[2]
        task_stage = str(row[3])
        task_condition = row[4]
        task_wav_no = row[5]
        task_seat_response = row[6]
        task_simple_seat_response = row[7].replace('【faq答案】+', 'faq')
        task_label = _bracket_label(row[9])
        task_response_label = _bracket_label(row[8])
        if task_stage == '整体兜底':
            continue

        # Initialise the task entry.  path_to_dialogue_data() relies on this
        # positional layout.
        if task_name not in task_dict:
            task_dict[task_name] = [
                dict(),             # 0: entry condition -> [simple response, full response]
                list(),             # 1: selectable simplified responses
                list(),             # 2: full responses
                task_finish_logic,  # 3: how to decide the task is finished
                task_run_logic,     # 4: how the task proceeds
                list(),             # 5: flow trees
                list(),             # 6: response labels (may be empty strings)
            ]
        task = task_dict[task_name]

        # Build the task dictionary and the task-transfer dictionary.
        if '.' not in task_stage:
            # Root row of a task: every line of the condition cell is an entry
            # condition.
            full_response = _slot_text(slot_id, '', task_wav_no, task_seat_response)
            for condition in task_condition.split('\n'):
                task[0][condition] = [task_simple_seat_response, full_response]
            # NOTE(review): nesting recovered by intent - the response is
            # registered once per row, not once per entry condition.
            task[1].append(task_simple_seat_response)
            task[2].append(full_response)
            task[6].append(task_response_label)
            slot_id += 1
        elif '【' in task_simple_seat_response and '】' in task_simple_seat_response:
            result = task_transfer_pattern.search(task_simple_seat_response)
            if result.group() == '【faq答案】' and task_simple_seat_response not in task[1]:
                task[1].append(task_simple_seat_response)
                task[2].append(_slot_text(slot_id, task_label, task_wav_no, task_seat_response))
                task[6].append(task_response_label)
                slot_id += 1
            else:
                target_task = _strip_brackets(result.group())
                if '\n' in task_condition:
                    for condition in task_condition.split('\n'):
                        task_transfer[task_name + '+' + condition] = target_task
                elif '|' in task_condition:
                    task_transfer[task_name + '+' + task_condition.split('|')[0]] = target_task
                else:
                    task_transfer[task_name + '+' + task_condition] = target_task
        elif task_simple_seat_response not in task[1]:
            task[1].append(task_simple_seat_response)
            task[2].append(_slot_text(slot_id, task_label, task_wav_no, task_seat_response))
            task[6].append(task_response_label)
            slot_id += 1

        # Build the bot tree.  Node layout: tag = entry condition,
        # identifier = node id, parent = id of the previous node,
        # data = action (simplified response or a task-transfer marker).
        stage_ids = _expand_stage_ids(task_stage)
        if stage_ids is None and task_stage.count('.') == 0:
            tree = Tree()
            tree.create_node(task_condition, task_stage, None, task_simple_seat_response)
            # A new root replaces any tree previously stored for the task.
            task[5] = [tree, ]
        else:
            # Abbreviated ids (placeholders) are expanded to concrete nodes.
            for stage_id in (stage_ids if stage_ids is not None else [task_stage]):
                for tree in task[5]:
                    tree.create_node(
                        task_condition, stage_id, _parent_id(stage_id),
                        task_simple_seat_response)

    # Every leaf of the '收集资金用途' task gets five children that all lead to
    # the '促成' task.
    for tree in task_dict['收集资金用途'][5]:
        for leaf in tree.leaves():
            for index, condition in enumerate(['金额', '肯定', '否定', '其他', 'FAQ']):
                tree.create_node(
                    condition, leaf.identifier + '.' + str(index + 1),
                    leaf.identifier, '【促成】')

    # Build the response-selection training data.
    input_list = list()
    insert_data_list = list()
    # The NLU sample generation is disabled, so this list stays empty and an
    # empty JSON array is written below.
    nlu_data_list = list()
    for task_name in tqdm.tqdm(task_dict):
        for tree in task_dict[task_name][5]:
            seen_windows = set()
            for leaf_path in tree.paths_to_leaves():
                if len(leaf_path) >= 3:
                    # The first two nodes, then every window of three
                    # consecutive nodes.
                    windows = [leaf_path[:2]]
                    windows.extend(
                        leaf_path[start:start + 3] for start in range(len(leaf_path) - 2))
                else:
                    windows = [leaf_path]
                for window in windows:
                    key = tuple(window)
                    if key in seen_windows:
                        continue
                    seen_windows.add(key)
                    data_list, input_list = path_to_dialogue_data(
                        window, task_dict, task_name, tree, input_list,
                        FAQ_name_list, dialogue_prompt)
                    insert_data_list.extend(data_list)

    random.shuffle(insert_data_list)
    print(len(insert_data_list))
    print(len(nlu_data_list))
    jdump(insert_data_list, './v2_train_data_360UJD首贷.json')
    jdump(nlu_data_list, './v2_nlu_train_data_360UJD首贷.json')


def path_to_dialogue_data(paths, task_dict, task_name, tree, input_list, FAQ_name_list, prompt):
    """Turn one tree path into response-selection samples.

    Args:
        paths: Node ids, in dialogue order.
        task_dict: Task dictionary as built by ``bot_document_to_dialogue``
            (indices 0, 1, 3 and 4 of the task entry are read).
        task_name: Key of the task in *task_dict*.
        tree: Object whose ``get_node(id)`` returns a node with ``tag`` (user
            side) and ``data`` (seat side).
        input_list: Inputs already emitted; used for de-duplication and
            extended in place.
        FAQ_name_list: FAQ names substituted for nodes whose tag contains 'FAQ'.
        prompt: Template with three ``{}`` slots: finish logic, run logic and
            the '|'-joined selectable responses.

    Returns:
        ``(data_list, input_list)`` - the new samples (dicts with the keys
        'instruction', 'input' and 'output') and the updated *input_list*.
    """
    data_list = list()
    task_transfer_pattern = re.compile(r'【.*】')
    turn_template = '用户:{}\n销售员:{}\n'

    # Expand the path into every concrete dialogue it stands for.
    dialogues_list = ['', ]
    for position, path_id in enumerate(paths):
        node = tree.get_node(path_id)
        if 'FAQ' in node.tag and position != 0:
            # "FAQ" optionally followed by '|'-separated names to leave out.
            excluded = node.tag.replace('FAQ', '').split('|')
            user_turns = [name for name in FAQ_name_list if name not in excluded]
        elif '\n' in node.tag and position != 0:
            user_turns = node.tag.split('\n')
        else:
            user_turns = [node.tag]
        dialogues_list = [
            dialogues + turn_template.format(user_turn, node.data)
            for dialogues in dialogues_list
            for user_turn in user_turns
        ]

    # Paths starting at the root are emitted once per entry condition.
    # NOTE(review): nesting recovered by intent - paths that do not start at
    # node '1' keep their dialogues unchanged.
    if paths[0] == '1':
        rooted = list()
        for dialogues in dialogues_list:
            for enter_condition in task_dict[task_name][0]:
                if enter_condition in dialogues:
                    rooted.append(dialogues)
                else:
                    rooted.append(
                        '用户:卡卡卡\n销售员:' + enter_condition
                        + '用户:'.join(dialogues.split('用户:')[1:]))
        dialogues_list = rooted

    instruction = prompt.format(
        task_dict[task_name][3],
        task_dict[task_name][4],
        '|'.join(task_dict[task_name][1]),
    )
    for dialogues in dialogues_list:
        seat_parts = dialogues.split('销售员:')
        # Everything between the first seat turn and the last one is the model
        # input; the last seat turn is the expected output.
        dialogue_input = '销售员:' + '销售员:'.join(seat_parts[1:-1])
        output = seat_parts[-1].replace('\n', '')
        result = task_transfer_pattern.search(output)
        if result and result.group() != '【faq答案】':
            # A transfer to another task means this task is finished.
            output = '任务完成'
        if dialogue_input not in input_list:
            input_list.append(dialogue_input)
            data_list.append({
                'instruction': instruction,
                'input': dialogue_input,
                'output': output,
            })
    return data_list, input_list


def bot_document_to_faq(file_path):
    """Read the FAQ sheet (second sheet) of the bot document.

    Returns:
        ``(FAQ_off_list, FAQ_name_list, FAQ_answer_dict)`` where
        ``FAQ_off_list`` holds the FAQ names whose sign column is '挂机',
        ``FAQ_name_list`` the FAQ names usable as user turns (polling and a
        fixed set of special FAQs excluded, three generic intents added) and
        ``FAQ_answer_dict`` maps a name to ``[response, recording no, label]``.
    """
    _require_third_party()
    wb = load_workbook(file_path)
    ws = wb[wb.sheetnames[1]]
    workspace_all_intent_list = ['不需要-无原因', '不需要-会考虑', '不需要-不缺钱']
    excluded_names = [
        '投诉', '别给我打电话了', '你怎么有我号码', '强烈拒绝', '语音信箱',
        '第一次静音', '第二次静音', '第三次静音', '多次不需要', '多次在忙',
    ]
    FAQ_answer_dict = dict()
    FAQ_name_list = list()
    FAQ_off_list = list()
    for index, row in tqdm.tqdm(enumerate(ws.values)):
        if index == 0 or not row[0]:
            continue  # header row or empty line
        faq_name = row[0]
        faq_wav_no = row[1]
        faq_label = row[2]
        faq_seat_response = row[3]
        faq_sign = row[5]
        FAQ_answer_dict[faq_name] = [faq_seat_response, faq_wav_no, faq_label]
        if '轮询' not in faq_name and faq_name not in excluded_names:
            FAQ_name_list.append(faq_name)
        # NOTE(review): nesting recovered by intent - the hang-up check is
        # independent of the name filter above.
        if faq_sign == '挂机':
            FAQ_off_list.append(faq_name)
    for other_intent in workspace_all_intent_list:
        if other_intent not in FAQ_name_list:
            FAQ_name_list.append(other_intent)
    return FAQ_off_list, FAQ_name_list, FAQ_answer_dict


def _normalize_test_turn(row):
    """Map a path element to the literal text written to the test sheet.

    The checks run in sequence on purpose: the result of an earlier rewrite is
    what the later checks see.
    """
    if 'FAQ' in row:
        row = 'FAQ'
    if '不需要-无原因' in row:
        row = '不需要'
    if '在忙-无原因' in row:
        row = '在忙'
    if row == '静音':
        row = '@@quiet@@'
    if '\n' in row:
        row = row.split('\n')[0]
    if row in ['平台银行用款方案', '平台产品']:
        row = '花呗'
    if row == '金额':
        row = '三万'
    if row == '其他':
        row = '不明'
    if row in ['资金周转', '生活消费']:
        row = '肯定'
    return row


def bot_document_to_tree(file_path):
    """Generate end-to-end test dialogues from the bot document.

    Builds one flow tree per entry condition of every task, flattens the trees
    into tag/data paths, chains the tasks together starting from the '核身'
    task by following the 【task】 transfer markers, and writes the resulting
    user-turn sequences with the expected seat reply to
    ``./test_dialogue_data.xlsx``.

    Raises:
        KeyError: if the tasks '收集资金用途' or '核身' are missing, or a
            transfer marker names an unknown task.
    """
    _require_third_party()
    wb = load_workbook(file_path)
    ws = wb[wb.sheetnames[0]]
    _, FAQ_name_list, _ = bot_document_to_faq(file_path)
    print(FAQ_name_list)
    task_transfer_pattern = re.compile(r'【.*】')
    task_name = ''
    task_dict = dict()
    for row in ws.values:
        task_name = row[0] if row[0] else task_name
        if task_name == '流程文档结束':
            break
        task_stage = str(row[3])
        task_condition = row[4]
        task_wav_no = row[5]
        task_seat_response = row[6]
        task_simple_seat_response = row[7].replace('【faq答案】+', 'faq')
        task_label = _bracket_label(row[9])
        task_response_label = _bracket_label(row[8])
        if task_stage == '整体兜底':
            continue

        if task_name not in task_dict:
            task_dict[task_name] = {
                'tree': dict(),                   # entry condition -> flow tree
                'tree_path': list(),              # flattened tag/data paths
                'enter_simple_response': dict(),  # entry condition -> simple response
                'simple_response': list(),
                'simple_response_to_response': dict(),
                'next_task': dict(),              # target task -> shortest path reaching it
            }
        task = task_dict[task_name]

        # Register the simplified response.
        # NOTE(review): nesting recovered by intent - details are stored for
        # the first occurrence of a response only.
        if task_simple_seat_response not in task['simple_response']:
            task['simple_response'].append(task_simple_seat_response)
            task['simple_response_to_response'][task_simple_seat_response] = {
                'wav_no': task_wav_no,
                'response': task_seat_response,
                'label': task_label,
                'response_label': task_response_label,
            }

        # Build the bot tree.  Node data is the recording number when there is
        # one, otherwise the simplified response.
        node_data = task_wav_no if task_wav_no else task_simple_seat_response
        stage_ids = _expand_stage_ids(task_stage)
        if stage_ids is None and task_stage.count('.') == 0:
            # Root row: one tree per entry condition.
            for condition in task_condition.split('\n'):
                tree = Tree()
                tree.create_node(condition, task_stage, None, node_data)
                task['tree'][condition] = tree
                task['enter_simple_response'][condition] = task_simple_seat_response
        else:
            for stage_id in (stage_ids if stage_ids is not None else [task_stage]):
                for tree in task['tree'].values():
                    tree.create_node(
                        task_condition, stage_id, _parent_id(stage_id), node_data)

    # Every leaf of the '收集资金用途' task gets five children that all lead to
    # the '促成' task.
    for tree in task_dict['收集资金用途']['tree'].values():
        for leaf in tree.leaves():
            for index, condition in enumerate(['金额', '肯定', '否定', '其他', 'FAQ']):
                tree.create_node(
                    condition, leaf.identifier + '.' + str(index + 1),
                    leaf.identifier, '【促成】')

    all_num = 0
    # Collect every root-to-leaf path of every sub-tree.
    for task_name in task_dict:
        if task_name == '收集用户当前用款场景':
            break  # tasks from here on are deliberately not expanded
        task = task_dict[task_name]
        for tree in task['tree'].values():
            for paths in tree.paths_to_leaves():
                path_list = list()
                for path_id in paths:
                    node = tree.get_node(path_id)
                    path_list.append(node.tag)
                    path_list.append(node.data)
                task['tree_path'].append(path_list)
                # Remember the shortest path that reaches each other task.
                leave_data = task_transfer_pattern.search(tree.get_node(paths[-1]).data)
                if leave_data:
                    target_task = _strip_brackets(leave_data.group())
                    known = task['next_task'].get(target_task)
                    if known is None or len(known) > len(path_list):
                        task['next_task'][target_task] = path_list
        all_num += len(task['tree_path'])
        print(task_name)
        print(task['tree_path'])
        print(task['next_task'])
    print(all_num)

    # Chain the tasks, starting from the opening task '核身'.
    finish_paths_list = list(task_dict['核身']['tree_path'])
    # Tuples of the finished paths, for O(1) duplicate detection.
    seen_paths = {tuple(path) for path in finish_paths_list}
    unfinish_paths_list = list(task_dict['核身']['next_task'].values())
    while unfinish_paths_list:
        pending = list()
        for path in tqdm.tqdm(unfinish_paths_list):
            task = task_dict[_strip_brackets(task_transfer_pattern.search(path[-1]).group())]
            # Drop the transfer marker and the entry tag of the next task.
            prefix = path[:-1]
            for next_task_path in task['tree_path']:
                finish_path = prefix + next_task_path[1:]
                key = tuple(finish_path)
                if key not in seen_paths:
                    seen_paths.add(key)
                    finish_paths_list.append(finish_path)
            for next_task_path in task['next_task'].values():
                pending.append(prefix + next_task_path[1:])
        unfinish_paths_list = pending

    seen_user_turns = set()
    test_path_list = list()
    for path in finish_paths_list:
        # Replace a trailing transfer marker by the recording number of the
        # response the target task opens with.
        if '【' in path[-1] and '】' in path[-1]:
            enter_condition = path[-1].split('】')[-1]
            print(path)
            task = task_dict[_strip_brackets(task_transfer_pattern.search(path[-1]).group())]
            print(enter_condition, _strip_brackets(task_transfer_pattern.search(path[-1]).group()))
            print(task['enter_simple_response'])
            if enter_condition and enter_condition in task['enter_simple_response']:
                simple_response = task['enter_simple_response'][enter_condition]
            else:
                simple_response = task['enter_simple_response']['首句']
            wav_no = task['simple_response_to_response'][simple_response]['wav_no']
            print(wav_no)
            path[-1] = wav_no

        # Generate the test data.  After the first tag/data pair the path
        # alternates user turn (even index) / seat reply (odd index).
        user_response = list()
        for index, row in enumerate(path[2:]):
            row = _normalize_test_turn(row)
            if index % 2 == 0:
                user_response.append(row)
                continue
            user_turns = ','.join(user_response)
            if user_turns not in seen_user_turns:
                seen_user_turns.add(user_turns)
                test_path_list.append(user_turns + '+' + row)

    wb = Workbook()
    ws = wb.active
    for paths in tqdm.tqdm(test_path_list):
        ws.append(['', '', paths.split('+')[0], paths.split('+')[1]])
    wb.save('./test_dialogue_data.xlsx')


def get_wav_online_content(company_id, tts_model):
    """Fetch the online recording texts of a company / TTS model.

    Returns:
        ``{'<company_id>_<tts_model>': {file name: content}}`` built from the
        first entry of the response's ``data`` list.
    """
    _require_third_party()
    url = 'http://work.xi-ai.com/admin/soundRecording/getRecordManageList?companyId={}&ttsModel={}'.format(
        company_id, tts_model)
    wav_key = '{}_{}'.format(company_id, tts_model)
    wav_dict = {wav_key: dict()}
    response = json.loads(requests.get(url).text)
    for wav_data in response['data'][0]['list']:
        file_name = wav_data['fileName']
        content = wav_data['content']
        print(file_name, content)
        wav_dict[wav_key][file_name] = content
    return wav_dict


# Training / test data generation for intent labelling.
def bot_document_to_nlu(file_path):
    """Generate intent-labelling training data and write ``./nlu_train_data.json``.

    One sample is produced per query of every intent returned by
    ``FAQ_file()``.

    NOTE(review): *file_path* is not used - the flow document is always read
    from './[360-UJD-首贷].xlsx', and the seat responses collected from it only
    feed the printed count.  Confirm whether the argument should be honoured.
    """
    _require_third_party()
    intent_dict = FAQ_file()
    wb = load_workbook('./[360-UJD-首贷].xlsx')
    ws = wb[wb.sheetnames[0]]
    simple_list = list()
    for row in ws.values:
        if str(row[3]) == '整体兜底':
            break
        task_simple_seat_response = row[7].replace('【faq答案】+', 'faq')
        is_plain_response = (
            '【' not in task_simple_seat_response
            and '】' not in task_simple_seat_response
            and '再见' not in task_simple_seat_response
        )
        if is_plain_response and task_simple_seat_response not in simple_list:
            simple_list.append(task_simple_seat_response)
    print(len(simple_list))

    instruction = '''接下来会有一个用户的表述,你需要做的是用一个词代表用户的表述含义。'''
    data_list = [
        {
            'instruction': instruction,
            'input': '用户:' + query + '\n',
            'output': intent_name,
        }
        for intent_name in intent_dict
        for query in intent_dict[intent_name][0]
    ]
    print(len(data_list))
    random.shuffle(data_list)
    jdump(data_list, './nlu_train_data.json')


def bot_document_seat_response(file_path):
    """Return the distinct simplified seat responses of the flow sheet.

    Responses containing '再见' are skipped.  For a response with a 【...】
    marker that contains '+', the text before the '+' (brackets removed) is
    used; other responses with a marker are skipped.  The order of the returned
    list is not defined (it comes from a set).
    """
    _require_third_party()
    wb = load_workbook(file_path)
    ws = wb[wb.sheetnames[0]]
    seat_response = set()
    task_transfer_pattern = re.compile(r'【.*】')
    for row in ws.values:
        if not row[7]:
            continue
        task_simple_seat_response = row[7].replace('【faq答案】+', 'faq')
        if '再见' in task_simple_seat_response:
            continue
        result = task_transfer_pattern.search(task_simple_seat_response)
        if result and '+' in result.group():
            task_simple_seat_response = _strip_brackets(result.group().split('+')[0])
        elif result:
            continue
        seat_response.add(task_simple_seat_response)
    print(seat_response)
    return list(seat_response)


def request_chatgpt(content):
    """Send *content* as a system message to the Azure OpenAI deployment.

    Returns:
        The text of the first choice.
    """
    _require_third_party()
    prompt = [{'role': 'system', 'content': content}, ]
    response = openai.ChatCompletion.create(
        api_type="azure",
        api_version="2023-03-15-preview",
        api_base="https://lingxi-openai.openai.azure.com",
        # NOTE(review): this key is committed to source control - rotate it and
        # supply the new one through LINGXI_OPENAI_API_KEY, then drop the
        # literal fallback.
        api_key=os.environ.get("LINGXI_OPENAI_API_KEY", "45a5ee249f364e208dd950f87ab5aba7"),
        engine="gpt-35",
        messages=prompt,
        temperature=0.8,
        max_tokens=2048,
        request_timeout=10,
    )
    return response["choices"][0]['message']['content']


def FAQ_file():
    """Read ``./FAQ.xlsx`` into ``{intent name: [queries, meaning]}``.

    Sheet 1 supplies the queries (rows whose sixth column equals 1; NOINTENT
    rows take their name from the seventh column; a few names are folded into
    coarser ones).  Sheet 2 supplies the meaning of each intent.
    """
    _require_third_party()
    wb = load_workbook('./FAQ.xlsx')
    ws = wb[wb.sheetnames[0]]
    intent_dict = dict()
    for index, row in enumerate(ws.values):
        if index == 0 or row[5] != 1:
            continue  # header row or row not marked for use
        intent_name = row[1]
        query = row[2]
        if intent_name == 'NOINTENT':
            intent_name = row[6]
        intent_name = _INTENT_RENAMES.get(intent_name, intent_name)
        if intent_name not in intent_dict:
            intent_dict[intent_name] = [list(), '']
        intent_dict[intent_name][0].append(query)

    ws = wb[wb.sheetnames[1]]
    for row in ws.values:
        intent_name = row[0]
        intent_meaning = row[1]
        if intent_name in intent_dict:
            intent_dict[intent_name][1] = intent_meaning
    return intent_dict


def _connect_call_result_db():
    """Open a PyMySQL connection (dict cursors) to the outbound-call database."""
    return pymysql.connect(
        host="39.103.215.119",
        # host="am-8vbwn20384jdq3vq185480.zhangbei.ads.aliyuncs.com",
        port=3308,
        # port=3306,
        user="ds_user",
        # NOTE(review): this password is committed to source control - rotate it
        # and supply the new one through DS_USER_DB_PASSWORD, then drop the
        # literal fallback.
        passwd=os.environ.get("DS_USER_DB_PASSWORD", "Moxi123#"),
        # db="data_center_temp",
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor
    )


def _resolve_label(intent, attitude):
    """Return the intent, or the attitude label when there is no concrete intent."""
    if intent and intent not in _ATTITUDE_LABELS:
        return intent
    if attitude == 1:
        return '肯定态度'
    if attitude == -1:
        return '否定态度'
    return '无态度'


def check_360_nlu():
    """Compare the labels stored for live calls with a second NLU service.

    Loads the sessions of four ten-minute windows on 2023-08-24, re-labels every
    user utterance through the NLU endpoint and writes the utterances whose
    label differs to ``./360_1.0对比结果.xlsx``.
    """
    _require_third_party()
    # The utterance is passed through `params` so it gets URL-encoded; the
    # recovered source spliced it into '...&workspace=222¤t_query={}', where
    # '¤t' is a corrupted '&current'.
    nlu_url = 'http://8.142.8.47:8681/nlu'
    dm_url = 'https://work.xi-ai.com/dataCenter/dm/detail?sessionId={}'
    sql_template = (
        ' select dm_session_id, customer_id'
        ' from ods_outbound_data_platform.outbound_call_result'
        ' where call_start_time >= "2023-08-24 {}:00:00"'
        ' and call_start_time <= "2023-08-24 {}:10:00"'
        ' and company_id = 2186'
        ' and call_status in ("normalConnection", "transferFail") '
    )
    all_case = list()
    # One connection for all windows, closed even when a query fails.
    connection = _connect_call_result_db()
    try:
        cursor = connection.cursor()
        for hour in ['11', '14', '16', '18']:
            cursor.execute(sql_template.format(hour, hour))
            all_case.extend(cursor.fetchall())
    finally:
        connection.close()

    wb = Workbook()
    ws = wb.active
    ws.append(['上文', '语句', '线上标签', '对比标签'])
    for result in tqdm.tqdm(all_case):
        session_id = result['dm_session_id']
        turns = json.loads(requests.get(dm_url.format(session_id)).text)['data']
        last_seat_query = ''
        for res in turns:
            if res['speakerType'] == 'USER' and res['idlResultJson']:
                result_json = json.loads(res['idlResultJson'])
                query = result_json['query']
                online_label = _resolve_label(
                    result_json['standardQuery'], result_json['originalAttitude'])
                nlu_result = json.loads(requests.get(
                    nlu_url,
                    params={'session_id': -1, 'workspace': 222, 'current_query': query},
                ).text)
                label = _resolve_label(
                    nlu_result['standard_query'], nlu_result['original_attitude'])
                if label != online_label:
                    print(last_seat_query, )
                    print(query, online_label, label)
                    ws.append([last_seat_query, query, online_label, label])
            elif res['speakerType'] == 'IVR':
                last_seat_query = res['msgContent']
    wb.save('./360_1.0对比结果.xlsx')


def dialogue_data():
    """Export the distinct user utterances of one morning of calls for labelling.

    Writes ``./标注测试.xlsx``: the first sheet has one row per distinct
    utterance (first occurrence), the sheet '语料出现次数' lists the utterances
    seen more than once with their count.
    """
    _require_third_party()
    sql = (
        ' select dm_session_id, customer_id'
        ' from ods_outbound_data_platform.outbound_call_result'
        ' where call_start_time >= "2023-09-07 00:00:00"'
        ' and call_start_time <= "2023-09-07 12:00:00"'
        ' and robot_answer_duration > 30'
        ' and dm_version in ("JT-实时-实分转-淑婷-20230221新版A")'
        ' and call_status in ("normalConnection", "transferFail") '
    )
    connection = _connect_call_result_db()
    try:
        cursor = connection.cursor()
        cursor.execute(sql)
        results = cursor.fetchall()
    finally:
        connection.close()

    # dm_url = 'https://work.xi-ai.com/dataCenter/dm/detail?sessionId={}'
    dm_url = 'http://172.26.2.56:8630/report/getDetailedRecord/?sessionId={}'
    print(len(results))
    wb = Workbook()
    ws = wb.active
    ws.append(['客户ID', 'session_id', '角色', '语句', '意图', '态度'])
    query_dict = dict()  # utterance -> number of occurrences
    for result in tqdm.tqdm(results):
        session_id = result['dm_session_id']
        customer_id = result['customer_id']
        response = json.loads(requests.get(dm_url.format(session_id)).text)['result']
        for res in response:
            if res['speakerType'] == 'USER' and res['idlResultJson']:
                result_json = json.loads(res['idlResultJson'])
                intent = result_json['standardQuery'] if result_json['standardQuery'] else "NOINTENT"
                attitude = result_json['originalAttitude']
                query = result_json['query']
                if query not in query_dict:
                    query_dict[query] = 0
                    # NOTE(review): five values under a six-column header (no
                    # value for '角色'), so the utterance lands in the third
                    # column - which is where process_data_to_llama2() reads it.
                    # Confirm the intended layout before aligning either side.
                    ws.append([customer_id, session_id, query, intent, attitude])
                query_dict[query] += 1

    ws_2 = wb.create_sheet('语料出现次数')
    ws_2.append(['语句', '次数'])
    for query in tqdm.tqdm(query_dict):
        if query_dict[query] > 1:
            ws_2.append([query, query_dict[query]])
    wb.save('./标注测试.xlsx')


def _find_repeat(text, start, max_width=5):
    """Find the shortest unit at *start* that is immediately repeated.

    Returns:
        ``(unit, width, repeats)`` for the first width in 1..max_width whose
        unit occurs again right after itself (``repeats`` counts the extra
        occurrences), or None when nothing repeats.
    """
    for width in range(1, max_width + 1):
        unit = text[start:start + width]
        repeats = 0
        while unit == text[start + width * (repeats + 1):start + width * (repeats + 2)]:
            repeats += 1
        if repeats:
            return unit, width, repeats
    return None


def _collapse_repeats_once(text):
    """Run one pass of repeat collapsing; return ``(new text, another pass needed)``."""
    pieces = list()
    collapsed = False
    position = 0
    while position < len(text):
        found = _find_repeat(text, position)
        if found is None:
            pieces.append(text[position])
            position += 1
            continue
        unit, width, repeats = found
        if u'\u4e00' <= unit <= u'\u9fff':
            # Chinese text: keep a single occurrence and look again, the
            # shortened text may expose new repeats.
            pieces.append(unit)
            collapsed = True
        else:
            # Anything else (digits, letters, ...): keep two occurrences.  This
            # must not request another pass - the kept pair would be matched
            # again forever.
            pieces.append(unit * 2)
        position += width * (repeats + 1)
    return ''.join(pieces), collapsed


def process_queries(current_query, ):
    """Normalise a user utterance by removing stutter-like repetition.

    1. Immediately repeated units of up to five characters are collapsed
       (Chinese units to one occurrence, other units to two) until stable.
    2. A repetition split by punctuation ("好,好的") loses its first half.
    3. Doubled punctuation becomes a single ',' and one leading ',' / '。' is
       dropped.

    An empty utterance, or one that becomes empty, yields '' (this used to
    raise IndexError).
    """
    # Remove adjacent identical words.
    again = True
    while again:
        current_query, again = _collapse_repeats_once(current_query)

    # Merge identical characters that are separated by a punctuation mark.
    current_query_result = ''
    for index, char in enumerate(current_query):
        if char in [',', '。', '?', '!', '、']:
            for width in range(1, 5):
                before = current_query[max(index - width, 0):index]
                after = current_query[index + 1:index + width + 1]
                if before == after:
                    current_query_result = current_query_result[:-1 * width]
                    break
        current_query_result += char
    current_query = current_query_result

    for point_sign in ['?。', '?,', ',。', ',,', '。,', '。。']:
        current_query = current_query.replace(point_sign, ',')
    if current_query and current_query[0] in [',', '。']:
        current_query = current_query[1:]
    return current_query


def process_data_to_llama2():
    """Convert ``./标注测试.xlsx`` into multi-turn conversation records.

    Rows are grouped by session id (second column); the third column is the
    utterance.  Text starting with '《' is a seat turn ('gpt') and is reduced to
    the reply texts between '||' and '#@'; everything else is a user turn
    ('human') with any leading '[label]' removed.  Consecutive turns of the same
    speaker are concatenated.

    Returns:
        A list of ``{'id', 'model', 'conversations'}`` dicts; every
        conversation starts with a fixed human greeting.
    """
    _require_third_party()
    wb = load_workbook('./标注测试.xlsx')
    ws = wb[wb.sheetnames[0]]
    session_id_dict = dict()
    for index, row in tqdm.tqdm(enumerate(ws.values)):
        if index == 0 or not row[2]:
            continue  # header row or empty utterance
        content = row[2]
        turns = session_id_dict.setdefault(row[1], list())
        if content.startswith('《'):
            speaker_type = 'gpt'
            content = ''.join(
                part.split('#@')[0].split('||')[-1] for part in content.split('@#')[1:])
        else:
            speaker_type = 'human'
            content = content.split(']')[-1]
        turns.append([speaker_type, content])

    # Merge consecutive turns of the same speaker.
    for session_id, turns in session_id_dict.items():
        dialogue_list = list()
        for speaker_type, content in turns:
            if dialogue_list and dialogue_list[-1][0] == speaker_type:
                dialogue_list[-1][-1] += content
            else:
                dialogue_list.append([speaker_type, content])
        session_id_dict[session_id] = dialogue_list

    data_list = list()
    for turns in session_id_dict.values():
        conversations = [{'from': 'human', 'value': '你好。'}]
        for speaker_type, content in turns:
            conversations.append({
                'from': speaker_type,
                'value': process_queries(content) if speaker_type == 'human' else content
            })
        data_list.append({
            'id': generate_uuid(),
            'model': '',
            'conversations': conversations,
        })
    return data_list


if __name__ == '__main__':
    # Prompt for generating occupations and their categories (currently unused).
    content = (
        ' 你扮演一个用户,有人询问“您的职业是什么”,给出50个职业名称,可以是自己做生意的,也可以是上班的,'
        '并将每个职业名称在“建材生意,工程生意,开店做生意,养殖生意,其他生意,上班”中选择一个进行归类,'
        '给出的生意不用受归类的影响。 '
        '输出格式以json格式输出,key为归类名称,value为同归类的列表集合。 '
    )
    # bot_document_to_dialogue('./[360-UJD-首贷].xlsx')
    # bot_document_to_tree('./[360-UJD-首贷].xlsx')
    # bot_document_to_nlu('./credit_intent.xlsx')
    # FAQ_file()
    # check_360_nlu()
    dialogue_data()
    # A disabled step used to merge process_data_to_llama2() with samples from
    # './cot_2023-08-25.json' and
    # './counterfactural_correction_multi_round_chat.json' (shuffled, capped at
    # half the number of dialogues) and write the result to
    # './2244_dialogue.json'.