| |
| |
| |
| import os |
| from io import StringIO |
| import requests |
| import gradio as gr |
| import pandas as pd |
| import numpy as np |
| import openai |
| import tiktoken |
| |
| from openai.embeddings_utils import get_embedding, cosine_similarity |
| |
| |
| |
| from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings |
| from langchain.vectorstores import FAISS |
| from langchain.chat_models import ChatOpenAI |
| from langchain.memory import ConversationBufferMemory |
| from langchain.chains import ConversationalRetrievalChain |
| from langchain.llms import OpenAI, HuggingFaceHub |
| from langchain.chains.question_answering import load_qa_chain |
| |
| import json |
| import ast |
| |
| from langchain.schema import Document |
| |
| |
| |
| |
| |
| import gspread |
| from oauth2client.service_account import ServiceAccountCredentials |
| from datetime import datetime |
|
|
| |
| |
| |
# --- Secrets and endpoints, all supplied through environment variables ---

# OpenAI key: read once and shared by the module-level name and the SDK.
api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = api_key

# Token and headers sent with every CSV download request.
# The Accept value asks the GitHub v3 API for raw file content.
token = os.getenv("token")
headers = {
    'Authorization': f'token {token}',
    'Accept': 'application/vnd.github.v3.raw',
}

# Google service-account credentials arrive as a JSON document in the
# "credentials" variable.  os.environ[...] is used (instead of os.getenv) so
# a missing variable fails with a KeyError naming it, rather than an opaque
# TypeError raised from json.loads(None).
credentials = json.loads(os.environ["credentials"])
gc = gspread.service_account_from_dict(credentials)

# URL of the spreadsheet opened by update_records().
Google_URL = os.getenv("Google_Sheet")
|
|
|
|
| |
| |
| |
| |
def _load_remote_csv(env_var, timeout=60):
    """Download the CSV whose URL is stored in ``env_var`` and parse it.

    Args:
        env_var: Name of the environment variable holding the file URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pandas.DataFrame parsed from the response body.

    Raises:
        RuntimeError: if the environment variable is unset or empty.
        requests.HTTPError: if the server answers with an error status, so an
            error page is never silently parsed as if it were CSV data.
    """
    url = os.getenv(env_var)
    if not url:
        raise RuntimeError(f"Environment variable {env_var!r} is not set")
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return pd.read_csv(StringIO(response.text))


# NOTE: the per-file url_*/response_*/csv_content_* temporaries and the nine
# df_tomos_1a28_NN part frames are no longer kept as module globals; nothing
# in this file reads them, and holding them kept every raw download in memory.

tomos_conf_DPR = _load_remote_csv("url_tomos_conf_DPR")
tomos_conf_cita = _load_remote_csv("url_tomos_conf_cita")

# The 1a28 data set is split across nine files (suffixes 01..09); they are
# concatenated in order with a fresh 0..n-1 index.
df_tomos_1a28 = pd.concat(
    [_load_remote_csv(f"url_df_tomos_1a28_{part:02d}") for part in range(1, 10)],
    ignore_index=True,
)

tercer_req = _load_remote_csv("url_tercer_req")
seg_req = _load_remote_csv("url_seg_req")
primer_req = _load_remote_csv("url_primer_req")

# These two frames get a constant "Folder" value assigned here.
primer1_req = _load_remote_csv("url_primer1_req")
primer1_req["Folder"] = "I. PRIMER REQUERIMIENTO (139)/2. Desahogo Reiteracion 1 139"

primer2_req = _load_remote_csv("url_primer2_req")
primer2_req["Folder"] = "I. PRIMER REQUERIMIENTO (139)/1. Desahogo RFI 139"
|
|
| |
| |
| |
def clean_and_parse_embedding(embedding_str):
    """Parse a serialized embedding into a list of floats.

    Only the text between the last ``[`` and the first ``]`` after it is
    parsed, so wrappers around the bracketed list (for example
    ``array([...])``) are ignored.

    Args:
        embedding_str: String containing a bracketed list of numbers,
            separated by commas and/or whitespace.

    Returns:
        list[float]: the parsed values; empty for an empty list ``[]``.

    Raises:
        ValueError: if any item between the brackets is not a number.
    """
    inner = embedding_str.split('[')[-1].split(']')[0]
    # Tokenise by hand rather than ast.literal_eval: evaluating the bare
    # "a, b, c" text yields a scalar (not a sequence) for one-element
    # embeddings and a SyntaxError for empty ones.
    return [float(val) for val in inner.replace(',', ' ').split()]
|
|
# Convert the serialized 'Embedding' column of each of these frames, in
# place, into lists of floats using the bracket-tolerant parser.
for _frame in (
    tomos_conf_DPR,
    tomos_conf_cita,
    tercer_req,
    seg_req,
    primer_req,
    primer1_req,
    primer2_req,
):
    _frame['Embedding'] = _frame['Embedding'].apply(clean_and_parse_embedding)
|
|
| |
| |
| |
def parse_embedding(embedding_str):
    """Convert a serialized embedding into a list of floats.

    Args:
        embedding_str: Either the Python-literal text of a numeric sequence
            (e.g. ``"[0.1, 0.2]"``) or an already-parsed iterable of numbers.
            Accepting the latter makes the function safe to apply twice to
            the same column.

    Returns:
        list[float]: the embedding values.

    Raises:
        ValueError, SyntaxError: if a string argument is not a valid literal.
    """
    if isinstance(embedding_str, str):
        # literal_eval only evaluates literals, never arbitrary expressions.
        values = ast.literal_eval(embedding_str)
    else:
        values = embedding_str
    return [float(val) for val in values]
|
|
# The concatenated 1a28 frame is parsed with parse_embedding rather than the
# bracket-stripping parser used for the other frames.
df_tomos_1a28['Embedding'] = df_tomos_1a28['Embedding'].apply(parse_embedding)


# Every frame that the search functions scan, in this order.
list_of_dfs = [
    tomos_conf_DPR,
    tomos_conf_cita,
    df_tomos_1a28,
    tercer_req,
    seg_req,
    primer_req,
    primer1_req,
    primer2_req,
]
|
|
| |
| |
| |
def _buscar_top(busqueda, lista_de_datos, top_n):
    """Rank the pages of every frame against a query and keep the best ones.

    Args:
        busqueda: Query text; embedded with ``text-embedding-ada-002``.
        lista_de_datos: Iterable of DataFrames, each with the columns
            'Embedding', 'PDFName', 'PageNumber', 'PageText' and 'Folder'.
        top_n: Number of rows to return.

    Returns:
        DataFrame with columns PDFName, PageNumber, similitud, PageText and
        Folder, sorted by descending similarity, with at most ``top_n`` rows.
    """
    columnas = ['PDFName', 'PageNumber', 'similitud', 'PageText', 'Folder']
    busqueda_embed = get_embedding(busqueda, engine="text-embedding-ada-002")

    resultados = []
    for datos in lista_de_datos:
        # Scores go into a fresh frame instead of a "similitud" column written
        # onto the shared input frame, so concurrent searches cannot overwrite
        # each other's scores.
        similitud = datos['Embedding'].apply(
            lambda emb: cosine_similarity(emb, busqueda_embed)
        )
        parcial = datos[['PDFName', 'PageNumber', 'PageText', 'Folder']].assign(
            similitud=similitud
        )
        # Only the best top_n rows of a frame can reach the global top_n, so
        # trimming here keeps the final concat and sort small.
        parcial = parcial.sort_values("similitud", ascending=False).head(top_n)
        resultados.append(parcial[columnas])

    if not resultados:
        # pd.concat raises on an empty list; return an empty result instead.
        return pd.DataFrame(columns=columnas)

    return pd.concat(resultados).sort_values("similitud", ascending=False).head(top_n)


def buscar(busqueda, lista_de_datos):
    """Return the 20 pages most similar to ``busqueda`` across all frames."""
    return _buscar_top(busqueda, lista_de_datos, 20)


def buscar_ai(busqueda, lista_de_datos):
    """Return the 10 pages most similar to ``busqueda`` across all frames."""
    return _buscar_top(busqueda, lista_de_datos, 10)
|
|
| |
| |
| |
def count_text_extracted(pregunta):
    """Summarise which files the search hits for ``pregunta`` came from.

    Runs ``buscar`` over ``list_of_dfs`` and, for each (Folder, PDFName) pair
    among the hits, emits one sentence with the number of hits and the page
    numbers involved.

    Returns:
        str: one sentence per file, each followed by a blank line.
    """
    hits = buscar(pregunta, list_of_dfs)
    usage = hits.groupby(['Folder', 'PDFName'])['PageNumber'].count().reset_index()

    sentences = []
    for folder_name, pdf_name, count in zip(
        usage['Folder'], usage['PDFName'], usage['PageNumber']
    ):
        # Pages are listed in the order the hits appear in the search result.
        same_file = (hits['PDFName'] == pdf_name) & (hits['Folder'] == folder_name)
        pages = hits.loc[same_file, 'PageNumber'].tolist()
        page_numbers_str = ', '.join(str(page) for page in pages)
        sentences.append(
            f"Usé el archivo '{pdf_name}' del folder '{folder_name}' {count} (vez/veces) al extraer el texto de las páginas {page_numbers_str}.\n\n"
        )

    return "".join(sentences)
|
|
| |
| |
| |
|
|
def print_pdf_info(pregunta):
    """Render the text of the search hits for ``pregunta`` as one string.

    Runs ``buscar`` over ``list_of_dfs`` and, for each hit, emits the PDF
    name, the page number and the page text with every line tab-indented.

    Returns:
        str: the concatenated extracts, in search-result order.
    """
    df = buscar(pregunta, list_of_dfs)

    bloques = []
    for _, row in df.iterrows():
        pdf_name = row['PDFName']
        page_number = row['PageNumber']
        page_text = row['PageText']

        # A missing PageText cell is NaN (a float), which has no .split();
        # treat it as an empty page instead of crashing the handler.
        page_text = "" if pd.isna(page_text) else str(page_text)

        indented_page_text = '\n'.join('\t' + line for line in page_text.split('\n'))

        bloques.append(
            f'De "{pdf_name}":\n \tPágina {page_number}:\n\t {indented_page_text}\n'
        )

    # Joined once at the end rather than grown with repeated +=.
    return "".join(bloques)
|
|
| |
| |
| |
def vector_document(dataframe):
    """Wrap each 'PageText' value of ``dataframe`` in a ``Document``.

    Each document's metadata carries its 0-based position under the 'id' key.

    Returns:
        list of Document, in column order.
    """
    documents = []
    for position, text in enumerate(dataframe["PageText"]):
        documents.append(Document(page_content=text, metadata={'id': position}))
    return documents
|
|
| |
| |
| |
def info_pdf(pregunta):
    """Return the search hits for ``pregunta`` as a list of strings.

    Runs ``buscar`` over ``list_of_dfs``; each entry names the PDF, the page
    number and the page text with every line tab-indented.

    Returns:
        list[str]: one entry per hit, in search-result order.
    """
    df = buscar(pregunta, list_of_dfs)

    output_list = []
    for _, row in df.iterrows():
        pdf_name = row['PDFName']
        page_number = row['PageNumber']
        page_text = row['PageText']

        # A missing PageText cell is NaN (a float), which has no .split();
        # treat it as an empty page instead of raising.
        page_text = "" if pd.isna(page_text) else str(page_text)

        indented_page_text = '\n'.join('\t' + line for line in page_text.split('\n'))

        output_list.append(f'De "{pdf_name}": Página {page_number}: {indented_page_text}')

    return output_list
|
|
def get_completion_from_messages(messages, model="gpt-3.5-turbo-16k",
                                 temperature=0, max_tokens=4500):
    """Send ``messages`` to the chat-completion endpoint and return the reply.

    Args:
        messages: List of ``{'role': ..., 'content': ...}`` dicts.
        model: Chat model name.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on generated tokens passed to the API.

    Returns:
        The content of the first choice's message.
    """
    request = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    completion = openai.ChatCompletion.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message["content"]
|
|
def get_topic(user_message):
    """Ask the chat model to answer ``user_message`` with no extra context.

    The message is wrapped in ``####`` delimiters, as announced to the model
    in the system prompt.

    Returns:
        The model's reply text.
    """
    delimiter = "####"
    system_message = f"""
    Eres un abogado que trabaja en temas de competencia económica e investiga casos en México.
    Siempre intenarás responder en el mayor número posible de palabras.
    Las consultas o preguntas se delimitarán con los caracteres {delimiter}
    """

    # Remove any delimiter sequence typed by the user, so the input cannot
    # close the delimited section early and smuggle in its own instructions.
    user_message = str(user_message).replace(delimiter, "")

    messages = [
        {'role': 'system',
         'content': system_message},
        {'role': 'user',
         'content': f"{delimiter}{user_message}{delimiter}"},
    ]
    return get_completion_from_messages(messages)
|
|
def get_respuesta(user_message, informacion):
    """Ask the chat model to answer ``user_message`` using only ``informacion``.

    Args:
        user_message: The user's question.
        informacion: The context to answer from; it is interpolated into the
            prompt through its string representation.

    Returns:
        The model's reply text.
    """
    delimiter = "####"
    system_message = f"""
    Eres un abogado que trabaja en temas de competencia económica e investiga casos en México.
    Siempre intenarás responder en el mayor número posible de palabras.
    Las consultas o preguntas se delimitarán con los caracteres {delimiter}

    """

    # Remove any delimiter sequence typed by the user, so the question cannot
    # close the delimited section early and smuggle in its own instructions.
    user_message = str(user_message).replace(delimiter, "")

    messages = [
        {'role': 'system',
         'content': system_message},
        {'role': 'user',
         'content': f"""
    {delimiter}
    Estás intentando recopilar información relevante para tu caso.
    Usa exclusivamente la información contenida en la siguiente lista:
    {informacion}

    para responder sin límite de palabras lo siguiente: {user_message}
    Responde de forma detallada.
    {delimiter}
    """},
    ]

    return get_completion_from_messages(messages)
|
|
def update_records(user_message):
    """Append a (timestamp, message) row to worksheet "Hoja 2" of the log sheet.

    Args:
        user_message: Text to record in the second column.
    """
    worksheet = gc.open_by_url(Google_URL).worksheet("Hoja 2")
    timestamp = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")

    # One append request replaces three full-sheet reads and two single-cell
    # writes.  The old second write recomputed its row from the sheet length
    # after the first write, so concurrent users could land on each other's
    # rows.
    # USER_ENTERED matches how update_cell submitted the values before.
    # NOTE(review): with USER_ENTERED a message starting with "=" is treated
    # as a formula by Sheets — confirm whether that is acceptable here.
    worksheet.append_row(
        [timestamp, user_message],
        value_input_option="USER_ENTERED",
    )
|
|
def chat(user_message_1):
    """Answer a user question from the most relevant indexed pages.

    Retrieves the top pages with ``buscar_ai``, records the question through
    ``update_records`` and asks the model to answer using only the text of
    those pages.

    Returns:
        The model's reply text.
    """
    # The former get_topic() call was removed: its reply was never used, so
    # it only added a second chat-completion request to every question.
    uno = buscar_ai(user_message_1, list_of_dfs)
    lista_info = uno['PageText'].tolist()

    update_records(user_message_1)

    return get_respuesta(user_message_1, lista_info)
|
|
| |
# Gradio UI: one input box and a button that fills three output boxes.
with gr.Blocks() as demo:
    txt = gr.Textbox(label="Texto", lines=2)
    btn = gr.Button(value="Listo")
    txt_2 = gr.Textbox(value="", label="Donde (top 20):")
    txt_3 = gr.Textbox(value="", label="Extractos (top 20):")
    txt_1 = gr.Textbox(value="", label="Respuesta IA:")

    # Three independent listeners on the same click, one per output box.
    btn.click(fn=chat, inputs=txt, outputs=txt_1)
    btn.click(fn=count_text_extracted, inputs=txt, outputs=txt_2)
    btn.click(fn=print_pdf_info, inputs=txt, outputs=txt_3)


if __name__ == "__main__":
    # share=True is passed through to launch() unchanged.
    demo.launch(share=True)