# Hugging Face Space app (page-scrape residue removed: "Spaces: Sleeping")
import getpass
import os

import streamlit as st
from langchain.schema import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
# Credentials and tracing configuration.
# The OpenAI key is provided as a secret in the Hugging Face Space settings.
os.environ["OPENAI_API_KEY"] = st.secrets['OPENAI']
# NOTE(review): tracing is switched on but no LANGCHAIN_API_KEY is exported
# (the line below was left commented out) -- confirm tracing works without it.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGCHAIN_API_KEY"] = st.secrets['OPENAI']
# Session initialization: the sentinel string 'dummy' marks "chain not built yet".
# setdefault only writes the key on the first run of the session.
st.session_state.setdefault('chain', 'dummy')
def get_data():
    """Return the RAG chain stored in the Streamlit session state.

    Raises KeyError if the 'chain' slot was never initialized.
    """
    stored_chain = st.session_state["chain"]
    return stored_chain
def add_data(chain):
    """Persist *chain* in the Streamlit session state so reruns can reuse it."""
    st.session_state["chain"] = chain
# Build the RAG chain only once per session: the sentinel value 'dummy'
# means it has not been constructed yet.
chain = get_data()
if chain == 'dummy':
    # Source documents: 2024 circulars from the SII (Chilean tax authority),
    # downloaded directly from their public URLs.
    loaders = [
        PyPDFLoader("https://www.sii.cl/normativa_legislacion/circulares/2024/circu3.pdf"),
        PyPDFLoader("https://www.sii.cl/normativa_legislacion/circulares/2024/circu2.pdf"),
        PyPDFLoader("https://www.sii.cl/normativa_legislacion/circulares/2024/circu1.pdf"),
    ]
    docs = []
    for loader in loaders:
        docs.extend(loader.load())
    # Split the loaded pages into chunks of up to 10,000 characters
    # (chunk overlap is left at the splitter's default).
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
    docs = text_splitter.split_documents(docs)
    # Sentence-similarity model for embeddings (multilingual; covers Spanish).
    modelPath = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
    model_kwargs = {'device':'cpu'}  # or 'cuda'
    encode_kwargs = {'normalize_embeddings': False}
    # Embeddings that map the SII document chunks (and queries) to dense vectors.
    embeddings = HuggingFaceEmbeddings(
        model_name=modelPath,        # path to the pre-trained model
        model_kwargs=model_kwargs,   # model configuration options
        encode_kwargs=encode_kwargs  # encoding options
    )
    # Vector store and retriever: fetch the 3 most relevant chunks per query.
    db = FAISS.from_documents(docs, embeddings)
    retriever = db.as_retriever(search_kwargs={"k": 3})
    # Prompt template (Spanish, user-facing -- do not translate): answer the
    # question using only the retrieved context.
    template = """Responde la pregunta basado unicamente en el siguiente contexto
{contexto}
Pregunta: {pregunta}
"""
    # LLM and LCEL pipeline wiring.
    prompt = ChatPromptTemplate.from_template(template)
    model = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)
    def format_docs(docs):
        # Concatenate the retrieved chunks into a single context string.
        return "\n\n".join([d.page_content for d in docs])
    # The dict feeds {contexto} from the retriever and passes the raw user
    # question through as {pregunta}; the parser returns a plain string.
    chain = (
        {"contexto": retriever | format_docs, "pregunta": RunnablePassthrough()}
        | prompt
        | model
        | StrOutputParser()
    )
    # Cache the built chain in the session so later reruns skip this block.
    add_data(chain)
# --- UI: ask a question and run it through the chain on button press -------
pregunta = st.text_input('Ingresa algun texto:', value="Has un resumen del documento circu3.pdf")
tmp_button = st.button("CLICK")
if not tmp_button:
    # Guard clause: halt the script until the user presses the button.
    st.stop()
out = chain.invoke(pregunta)
st.write(out)