# -*- coding:utf-8 -*- # @Time : 2023/7/13 3:26 PM # @Author: chenyong # @Email : chenyong@lingxi.ai # @File : update.py import tqdm import pymysql import requests import pandas as pd # from remove_repeat_word import process_queries, rm_stop_word from openpyxl import load_workbook, Workbook sheet_num = 0 def read_xlsx(file_path): wb = load_workbook(file_path) ws = wb[wb.sheetnames[sheet_num]] xlsx_list = list() for i, row in enumerate(ws.values): if i != 0 and row[2]: query, label = row["query"], row["label"] xlsx_list.append((query, label)) return xlsx_list def update_query_with_mysql(query, new_query, label): connection = pymysql.connect(host="47.92.193.147", port=3306, user="root", passwd="Moxi123#", db="task_dialogue_config", charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor) cursor = connection.cursor() sql = \ ''' SELECT QUESTION_ID, QUERY_TITLE FROM N_RECORDED_QUERY WHERE SYSTEM_ID = {} AND QUERY_TITLE = '{}' ''' \ .format(str(223), query) cursor.execute(sql) ori_id_list = cursor.fetchall() if not ori_id_list: print("没有找到query:[ {} ]".format(query)) return id = ori_id_list[0]["QUESTION_ID"] sql1 = \ ''' UPDATE N_RECORDED_QUERY SET QUERY_TITLE = '{}' WHERE SYSTEM_ID = '{}' AND QUERY_TITLE = '{}' ''' \ .format(new_query, 223, query) cursor.execute(sql1) connection.commit() cursor.close() connection.close() return id def update_with_mysql(query, label): connection = pymysql.connect(host="47.92.193.147", port=3306, user="root", passwd="Moxi123#", db="task_dialogue_config", charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor) cursor = connection.cursor() # 获取人工标签的 QUERY_ANSWER 和 ANSWER_ID sql = \ ''' SELECT NRQ.QUERY_ANSWER, NRQ.ANSWER_ID, NRQ.QA_TYPE_ID FROM N_RECORDED_QUERY NRQ WHERE NRQ.SYSTEM_ID = {} AND NRQ.QUESTION_ID = NRQ.ANSWER_ID AND NRQ.QUERY_TITLE = '{}' '''\ .format(str(workspace), str(label)) # sql = "DELETE FROM N_RECORDED_QUERY WHERE ID = 1096853" cursor.execute(sql) change_to = cursor.fetchall() if not change_to: print("没有找到标签[{}]".format(label)) return QUERY_ANSWER, ANSWER_ID, QA_TYPE_ID = change_to[0].values() # print(QUERY_ANSWER, ANSWER_ID, QA_TYPE_ID) sql_exam = \ ''' SELECT NRQ.QUERY_ANSWER FROM N_RECORDED_QUERY NRQ WHERE NRQ.SYSTEM_ID = {} AND NRQ.QUERY_TITLE = '{}' ''' \ .format(str(workspace), str(query)) cursor.execute(sql_exam) cur_label = cursor.fetchall() if not cur_label: print("没有找到query:[ {} ]".format(query)) return if cur_label[0]["QUERY_ANSWER"] == label: print("[ {} ]的标签已经是[ {} ]".format(query, label)) return sql1 = \ ''' UPDATE N_RECORDED_QUERY NRQ SET NRQ.QUERY_ANSWER = '{}', NRQ.ANSWER_ID = '{}', NRQ.QA_TYPE_ID = '{}' WHERE NRQ.SYSTEM_ID = '{}' AND NRQ.QUERY_TITLE = '{}' ''' \ .format(QUERY_ANSWER, ANSWER_ID, QA_TYPE_ID, workspace, query) cursor.execute(sql1) connection.commit() sql2 = \ ''' SELECT ID, QUERY_TITLE FROM N_RECORDED_QUERY NRQ WHERE NRQ.SYSTEM_ID = '{}' AND NRQ.QUERY_TITLE = '{}' ''' \ .format(workspace, query) cursor.execute(sql2) id_res = cursor.fetchall() id, query = "", "" if id_res: id, query = id_res[0].values() cursor.close() connection.close() return id def delete_with_mysql(query, label): connection = pymysql.connect(host="47.92.193.147", port=3306, user="root", passwd="Moxi123#", db="task_dialogue_config", charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor) cursor = connection.cursor() # 获取人工标签的 QUERY_ANSWER 和 ANSWER_ID sql = \ ''' SELECT ID, QUERY_TITLE FROM N_RECORDED_QUERY WHERE SYSTEM_ID = {} AND QUERY_TITLE = '{}' '''\ .format(str(workspace), str(query)) cursor.execute(sql) text = cursor.fetchall() if not text: print("没有找到query:[ {} ]".format(query)) return to_del_id = text[0]["ID"] # [{'ID': 1182495, 'QUERY_TITLE': '暂时不考虑,什么事'}] sql_del = \ ''' DELETE FROM N_RECORDED_QUERY WHERE SYSTEM_ID = {} AND ID = {} ''' \ .format(str(workspace), to_del_id) cursor.execute(sql_del) connection.commit() cursor.close() connection.close() return to_del_id def update_es(workspace, id): req_addr = "http://172.26.4.253:8456/update_single_qa?systemId={}&knowledge_id={}".format(workspace, id) print(req_addr) for _ in range(2): try: html = requests.get(req_addr, timeout=5).text print(html) break except requests.exceptions.RequestException as e: print(e) continue def linshi(): connection = pymysql.connect(host="47.92.193.147", port=3306, user="root", passwd="Moxi123#", db="task_dialogue_config", charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor) cursor = connection.cursor() sql = 'SELECT ID, QUERY_TITLE, ANSWER_ID ' \ 'FROM N_RECORDED_QUERY ' \ 'WHERE QA_TYPE_ID = 0 ' \ 'AND SYSTEM_ID = {};' sql_2 = 'SELECT QA_TYPE_ID ' \ 'FROM N_RECORDED_QUERY ' \ 'WHERE QUESTION_ID = {}; ' sql_3 = ''' UPDATE N_RECORDED_QUERY SET QA_TYPE_ID = '{}' WHERE SYSTEM_ID = '{}' AND ID = '{}'; ''' \ for workspace in ['222', '307', '313', '329', '341']: cursor.execute(sql.format(workspace)) results = cursor.fetchall() print( len(results) ) ID_list = list() for result in tqdm.tqdm(results): # print( result ) cursor.execute(sql_2.format(result['ANSWER_ID'])) res = cursor.fetchall() # print( res[0]['QA_TYPE_ID'] ) # print(sql_3.format(res[0]['QA_TYPE_ID'], 303, result['ID'])) cursor.execute(sql_3.format(res[0]['QA_TYPE_ID'], workspace, result['ID'])) # res = cursor.fetchall() ID_list.append(result['ID']) # update_es(303, result['ID']) connection.commit() for id in tqdm.tqdm(ID_list): update_es(workspace, id) def rm_stop_word(query): stop_word = ['quiet', '@', '不好意思', '对不起', '好谢谢', '谢谢您', '谢谢你', '拜拜', '谢谢', '好吧', '好嘞', '你好', '您好', '然后', '抱歉', '再见', '再见', '受累', '姐夫', '嫂子', '大哥', '老妹', '兄弟', '美女', '谢', '哦', '啊', '嘞', '喂', '啦', '唉', '哎', '哥', '姐', '哈', '呐', '呃', '噢', '诶', '噢', '唔', '呢', '呀', '嗯,', '嗯。', '嗯', ',好,', '。好,', '。好。', ',行,', ',行。', '。行。', ',对,', ',对。', '。对。', ] if query == "@@quiet@@": return query for stop in stop_word: if len(query.replace(stop, '').replace(',', '').replace(',', '').replace('。', '').replace('?', '')) > 0: query = query.replace(stop, '') # query = query.replace(stop, '') for start_word in ['好,', '好。', '行,', '行。', '对,', '对。']: if query.startswith(start_word) and len(query[2:].replace(',', '').replace('。', '').replace('?', '')) >= 2: query = query[2:] return query def process_queries(current_query): pass_word = [ # '不需要了', '不需要', '不用了', '不用', '不要', '不要了', '一点', '考虑', '谢', '在', '看', '问', '想', '天', '刚', '试', '拜'] '''remove_word = ['嗯,', '嗯。', '嗯', ',好,', '。好,', '。好。', ',行,', ',行。', '。行。', ',对,', ',对。', '。对。'] for remove in remove_word: if len( current_query.replace(remove, '') ) > 2: current_query = current_query.replace(remove, '')''' # 删除相近相同词 sign = True while sign: i = 0 sign = False record_list = list() final_str = list() while i < len(current_query): repeat_num = 0 for j in range(1, 7): target = current_query[i:i + j] while True: if target == current_query[i + j * (repeat_num + 1):i + j * (repeat_num + 2)]: repeat_num += 1 else: break if repeat_num: record_append = (i, j, repeat_num) break if repeat_num: if target in pass_word or not (u'\u4e00' <= target <= u'\u9fff'): final_str.append(target * 2) else: final_str.append(target) sign = True record_list.append(target) i = i + j * (repeat_num + 1) continue final_str.append(current_query[i]) i += 1 current_query = ''.join(final_str) try: # 去除"好","行" current_query_list = list() for i in range(len(current_query)): if current_query[i] in ['好', '行']: if i == 0: if current_query[i+1] == ',': continue elif i == len(current_query) - 1: if current_query[i-1] == ',': continue current_query_list.append(current_query[i]) if len(current_query_list) > 2: current_query = ''.join(current_query_list) except Exception: pass # 将被标点符号隔开的相同字合并在一块 current_query_result = '' for i in range(len(current_query)): if current_query[i] in [',', '。', '?', '!']: for j in range(1, 5): if current_query[i - j if i - j >= 0 else 0:i] == current_query[i + 1:i + j + 1]: current_query_result = current_query_result[:-1 * j] break current_query_result += current_query[i] current_query = current_query_result for point_sign in ['?。', '?,', ',。', ',,', '。,', '。。', ',,']: current_query = current_query.replace(point_sign, ',') if len(current_query) > 0 and current_query[0] in [',', '。']: current_query = current_query[1:] return current_query '''def read_excel(file): datas = pd.read_excel(file, sheet_name='Sheet1', header=0, index_col=None,) data = {} for index, row in datas.iterrows(): text = row['文本'] label = row['意图'] data[text] = label return data''' def read_excel(): wb = load_workbook('./清洗后数据.xlsx') ws = wb[wb.sheetnames[0]] data = dict() for i, row in enumerate(ws.values): if i != 0: intent_name = row[1] query = row[2] data[query] = intent_name return data def insert_new_data(): connection = pymysql.connect(host="172.26.172.65", port=3306, user="root", passwd="Moxi123#", db="task_dialogue_config", charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor) cursor = connection.cursor() # 根据标准问查找ANSWER_ID sql_1 = """ select ANSWER_ID from N_RECORDED_QUERY where SYSTEM_ID=223 and QUERY_ANSWER='{}' limit 1; """ # 插入新数据 sql_2 = """ insert into N_RECORDED_QUERY (SYSTEM_ID, QUERY_ANSWER, ANSWER_ID, ENABLE, FLAG, QUERY_TITLE, QA_TYPE_ID) VALUES (223, '{}', {}, 1, 1, '{}', 482) """ # 根据新数据查找ID sql_3 = """ select ID from N_RECORDED_QUERY where SYSTEM_ID=223 and QUERY_TITLE='{}'; """ # file = "/Users/cy/work/NLU/语料库数据插入/insert_data.xlsx" data = read_excel() # print(data) ID_list = [] for query, label in tqdm.tqdm(data.items()): # print(query, label) query = rm_stop_word(query) query = process_queries(query) query = rm_stop_word(query) # print(query, label) new_label = label # print(sql_1.format(new_label)) cursor.execute(sql_1.format(new_label)) results_1 = cursor.fetchall() if len(results_1) > 0: new_ANSWER_ID = results_1[0]['ANSWER_ID'] # print(new_ANSWER_ID) cursor.execute(sql_2.format(new_label, new_ANSWER_ID, query)) # print(sql_2.format(new_label, new_ANSWER_ID, query)) cursor.execute(sql_3.format(query)) # print(sql_3.format(query)) results_3 = cursor.fetchall() ori_ID = results_3[0]['ID'] # print(ori_ID) ID_list.append(ori_ID) # update_es(223, ori_ID) connection.commit() for id in tqdm.tqdm(ID_list): print(id) update_es(223, id) connection.close() def qingxi(): connection = pymysql.connect(host="47.92.193.147", port=3306, user="root", passwd="Moxi123#", db="task_dialogue_config", charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor) cursor = connection.cursor() # 待更新的ID sql = """ SELECT QUERY_ANSWER, ANSWER_ID, QA_TYPE_ID FROM N_RECORDED_QUERY WHERE SYSTEM_ID = 248 AND QUERY_ANSWER = '{}' """ # 需要更新的query sql_1 = """ SELECT ID,QUERY_ANSWER FROM N_RECORDED_QUERY WHERE SYSTEM_ID = 248 AND QUERY_TITLE = '{}' """ # 更新 sql_2 = """ UPDATE N_RECORDED_QUERY SET QUERY_ANSWER = '{}', ANSWER_ID = {}, QA_TYPE_ID = {} WHERE SYSTEM_ID = 248 AND ID = {} """ file = "/Users/cy/work/NLU/FAQ-341.xlsx" data = read_excel(file) # print(data) ID_list = [] for query, label in data.items(): print(query, label) new_label = label cursor.execute(sql.format(new_label)) results = cursor.fetchall() if len(results) > 0: new_QUERY_ANSWER = results[0]['QUERY_ANSWER'] new_ANSWER_ID = results[0]['ANSWER_ID'] new_QA_TYPE_ID = results[0]['QA_TYPE_ID'] # print(new_QUERY_ANSWER, new_ANSWER_ID, new_QA_TYPE_ID) cursor.execute(sql_1.format(query)) results = cursor.fetchall() if results: ori_ID = results[0]['ID'] ori_QUERY_ANSWER = results[0]['QUERY_ANSWER'] if ori_QUERY_ANSWER != new_QUERY_ANSWER: cursor.execute(sql_2.format(new_QUERY_ANSWER, new_ANSWER_ID, new_QA_TYPE_ID, ori_ID)) ID_list.append(ori_ID) connection.commit() for id in tqdm.tqdm(ID_list): update_es(248, id) connection.close() if __name__ == "__main__": # file = "/Users/cy/work/NLU/保险语料库清洗/清洗后.xlsx" # read_excel(file) # qingxi() insert_new_data()