# SII_CIRCULARES / app.py
# NOTE: Hugging Face Space page residue (author caption, "Update app.py",
# commit 21b9e8f) removed -- it was not valid Python.
import streamlit as st
import getpass
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain.schema import StrOutputParser
# Configure OpenAI credentials and LangChain tracing via environment variables.
os.environ["OPENAI_API_KEY"] = st.secrets['OPENAI'] # added in the Hugging Face Space config
os.environ["LANGCHAIN_TRACING_V2"] = "true"
#os.environ["LANGCHAIN_API_KEY"] = st.secrets['OPENAI']
# One-time initialization: seed the session-state slot that caches the RAG
# chain across Streamlit reruns. 'dummy' is the "not built yet" sentinel.
if 'chain' not in st.session_state:
    st.session_state['chain'] = 'dummy'
def get_data():
    """Return the chain object cached in Streamlit's session state."""
    cached_chain = st.session_state["chain"]
    return cached_chain
def add_data(chain):
    """Cache *chain* in Streamlit's session state so reruns reuse it."""
    st.session_state["chain"] = chain
# Build the RAG chain only on the first run of the session; afterwards the
# cached chain is reused. (Fixes: restored indentation, corrected the stale
# "retrieves up to 4" comment -- k is 3 -- and removed a throwaway list
# inside join().)
chain = get_data()
if chain == 'dummy':
    # Load three SII circulares (2024) directly from the SII website.
    loaders = [
        PyPDFLoader("https://www.sii.cl/normativa_legislacion/circulares/2024/circu3.pdf"),
        PyPDFLoader("https://www.sii.cl/normativa_legislacion/circulares/2024/circu2.pdf"),
        PyPDFLoader("https://www.sii.cl/normativa_legislacion/circulares/2024/circu1.pdf"),
    ]
    docs = []
    for loader in loaders:
        docs.extend(loader.load())

    # Split into large chunks. NOTE(review): no chunk_overlap is set, so
    # context can be cut mid-sentence at chunk boundaries -- confirm intended.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
    docs = text_splitter.split_documents(docs)

    # Sentence-similarity embedding model (multilingual; covers Spanish).
    # These embeddings turn the SII questions/documents into dense vectors.
    modelPath = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
    model_kwargs = {'device': 'cpu'}  # or 'cuda'
    encode_kwargs = {'normalize_embeddings': False}
    embeddings = HuggingFaceEmbeddings(
        model_name=modelPath,       # path to the pretrained model
        model_kwargs=model_kwargs,  # model configuration options
        encode_kwargs=encode_kwargs # encoding options
    )

    # Vector store + retriever returning the 3 most relevant splits.
    db = FAISS.from_documents(docs, embeddings)
    retriever = db.as_retriever(search_kwargs={"k": 3})

    # Prompt (Spanish): answer only from the retrieved context.
    template = """Responde la pregunta basado unicamente en el siguiente contexto
{contexto}
Pregunta: {pregunta}
"""
    prompt = ChatPromptTemplate.from_template(template)
    model = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)

    def format_docs(docs):
        # Concatenate retrieved page contents into one context string.
        return "\n\n".join(d.page_content for d in docs)

    # LCEL pipeline: retrieve -> format -> prompt -> LLM -> plain string.
    chain = (
        {"contexto": retriever | format_docs, "pregunta": RunnablePassthrough()}
        | prompt
        | model
        | StrOutputParser()
    )
    add_data(chain)
# Minimal UI: question box plus a submit button; nothing runs until clicked.
pregunta = st.text_input('Ingresa algun texto:', value="Has un resumen del documento circu3.pdf")
tmp_button = st.button("CLICK")
if not tmp_button:
    # Guard clause: halt the script run until the user presses the button.
    st.stop()
out = chain.invoke(pregunta)
st.write(out)