File size: 3,076 Bytes
86f1c93
9e063a9
 
 
 
 
 
 
 
 
 
 
 
d2a4d18
53a6f75
b42a43f
e76e2e8
fc9959e
 
 
19563eb
e2eaff0
fc9959e
e2eaff0
 
9007e9c
fc9959e
e2eaff0
fbdee88
6fa3b56
3930a90
 
 
 
 
 
 
 
 
 
 
 
fcc8b9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73d1f72
15afc4b
21b9e8f
15afc4b
fcc8b9e
 
 
 
 
 
43cbe04
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import streamlit as st
import getpass
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain.schema import StrOutputParser

os.environ["OPENAI_API_KEY"] = st.secrets['OPENAI']  # OpenAI key injected via the hosting platform's secrets config (Hugging Face Space)
os.environ["LANGCHAIN_TRACING_V2"] = "true"  # NOTE(review): tracing is enabled but no LANGCHAIN_API_KEY is set below — confirm tracing actually works
#os.environ["LANGCHAIN_API_KEY"] = st.secrets['OPENAI'] 

# Initialization: seed the session slot with a sentinel string so the first
# script run knows the RAG chain has not been built yet (see the block below).
if 'chain' not in st.session_state:
    st.session_state['chain'] = 'dummy'

def get_data():
    """Return whatever is currently stored under the 'chain' session key."""
    stored = st.session_state["chain"]
    return stored


def add_data(chain):
    """Persist *chain* in the Streamlit session state under the 'chain' key."""
    st.session_state["chain"] = chain

# Build the RAG chain only once per session: the startup code seeds the
# session slot with the 'dummy' sentinel, so seeing it here means the
# expensive setup (PDF download, embedding, indexing) has not run yet.
chain = get_data()
if chain == 'dummy':
    # Source documents: 2024 SII (Chilean tax authority) circulars.
    pdf_loaders = [
        PyPDFLoader("https://www.sii.cl/normativa_legislacion/circulares/2024/circu3.pdf"),
        PyPDFLoader("https://www.sii.cl/normativa_legislacion/circulares/2024/circu2.pdf"),
        PyPDFLoader("https://www.sii.cl/normativa_legislacion/circulares/2024/circu1.pdf"),
    ]
    documents = []
    for pdf_loader in pdf_loaders:
        documents.extend(pdf_loader.load())

    # Split the loaded pages into large chunks for retrieval.
    splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
    documents = splitter.split_documents(documents)

    # Multilingual sentence-similarity embeddings (handles Spanish), on CPU;
    # switch the device to 'cuda' when a GPU is available.
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
        model_kwargs={'device':'cpu'},
        encode_kwargs={'normalize_embeddings': False},
    )

    # Dense vector index over the chunks; the retriever returns the
    # 3 most relevant splits per query.
    vector_db = FAISS.from_documents(documents, embeddings)
    retriever = vector_db.as_retriever(search_kwargs={"k": 3})

    template = """Responde la pregunta basado unicamente en el siguiente contexto
    
    {contexto}
    
    Pregunta: {pregunta}
    
    """

    prompt = ChatPromptTemplate.from_template(template)
    llm = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)

    def format_docs(docs):
        """Join retrieved chunks into one context string, blank-line separated."""
        return "\n\n".join(d.page_content for d in docs)

    # LCEL pipeline: retrieve + format context, pass the question through,
    # fill the prompt, call the model, and parse the reply to plain text.
    chain = (
        {"contexto": retriever | format_docs, "pregunta": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    add_data(chain)


# UI: collect a question and only invoke the chain after the button press.
pregunta = st.text_input('Ingresa algun texto:', value="Has un resumen del documento circu3.pdf")
clicked = st.button("CLICK")
if not clicked:
    st.stop()  # halt this script run until the user presses the button
answer = chain.invoke(pregunta)
st.write(answer)