segoedu committed on
Commit
7b15149
·
verified ·
1 Parent(s): 9254c9b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +230 -98
app.py CHANGED
@@ -1,118 +1,250 @@
1
- import streamlit as st
2
  import os
 
 
3
 
4
- from groq import Groq
5
  from PyPDF2 import PdfReader
6
- from datetime import datetime
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
- from langchain_community.embeddings import HuggingFaceEmbeddings
9
- #from langchain.vectorstores import FAISS
10
  from langchain_community.vectorstores import FAISS
11
  from langchain_groq import ChatGroq
12
- #from langchain.chat_models import ChatOpenAI
13
- from langchain.chains.question_answering import load_qa_chain
14
-
15
- st.set_page_config('Lectorín')
16
- st.header("Pregunta a tu PDF")
17
# SECURITY: a real API key must never be committed as the widget's default
# value. Prefill from the environment instead (empty when unset) and let the
# user type the key. Any key that was previously committed here should be
# treated as compromised and revoked.
GROQ_API_KEY = st.text_input('Groq API Key', value=os.getenv("GROQ_API_KEY", ""), type='password')
18
- pdf_obj = st.file_uploader("Carga tu documento", type="pdf", on_change=st.cache_resource.clear)
19
- modelos = {
20
- 'multi, 512, 0.47G, 384 - intfloat/multilingual-e5-small': ('intfloat/multilingual-e5-small',512),
21
- 'multi, 256, 0.08G, 384 - multi-qa-MiniLM-L6-cos-v1': ('multi-qa-MiniLM-L6-cos-v1',256),
22
- 'multi,8192, 2.27G,1024 - BAAI/bge-m3': ('BAAI/bge-m3', 8192),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
- modelo = st.selectbox('Modelo de embedding', list(modelos.keys()))
25
- modelo_embeddings, sequence = modelos[modelo]
26
- chunk_size = sequence * 5 # en español, de media una palabra tiene 5 caracteres
27
-
28
- modelos_llm = [
29
- 'llama3-70b-8192',
30
- 'llama3-8b-8192',
31
- 'mixtral-8x7b-32768',
32
- 'gemma-7b-it'
33
- ]
34
- modelo_llm = st.selectbox('Modelo de lenguaje', list(modelos_llm))
35
-
36
# LangSmith tracing.
# SECURITY: the API key is no longer hard-coded in the source. Tracing is
# switched on only when LANGCHAIN_API_KEY is already provided through the
# environment; otherwise the app runs without tracing. Any key that was
# previously committed here should be treated as compromised and revoked.
if os.getenv("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_PROJECT"] = "qpdf"
40
-
41
-
42
- def save_to_file():
43
- with open("historial.txt", "a", encoding="utf-8") as archivo:
44
- # Añadir la fecha y hora actual
45
- archivo.write("-" * 25 )
46
- fecha_hora_actual = datetime.now().strftime("%Y-%m-%d %H:%M")
47
- archivo.write(f" {fecha_hora_actual} ")
48
- archivo.write(f" ({file_name}) ")
49
- archivo.write("-" * 25 + "\n")
50
- # Guardar preguntas
51
- archivo.write(f"Pregunta: {user_question}\n")
52
- # Guardar respuestas
53
- archivo.write(f"Respuesta: {respuesta}\n")
54
-
55
-
56
- @st.cache_resource
57
- def create_embeddings(pdf):
58
- pdf_reader = PdfReader(pdf)
59
- text = ""
60
- for page in pdf_reader.pages:
61
- text += page.extract_text()
62
-
63
- text_splitter = RecursiveCharacterTextSplitter(
64
- chunk_size=chunk_size,
65
- chunk_overlap=150,
66
- length_function=len
67
- )
68
 
69
- chunks = text_splitter.split_text(text)
70
- embeddings = HuggingFaceEmbeddings(model_name=modelo_embeddings)
71
- knowledge_base = FAISS.from_texts(chunks, embeddings)
 
 
 
 
 
 
 
 
72
 
73
- return knowledge_base
 
 
74
 
 
 
75
 
76
- # Función para mostrar logs
77
- def mostrar_logs(logs,hints):
78
- # Crear un contenedor desplegable
79
- with st.expander("Chunks"):
80
- for hint in hints:
81
- st.write(hint.page_content)
82
- st.write("-" * 30)
83
 
84
- st.sidebar.header("Registro de preguntas")
85
- for entry in logs:
86
- st.sidebar.write(f"**Pregunta: {entry['Pregunta']}**")
87
- st.sidebar.write(f"Respuesta: {entry['Respuesta']}")
 
 
 
 
 
 
 
88
 
89
 
90
- # Lista para almacenar preguntas y respuestas
91
- logs = []
 
92
 
93
- if pdf_obj:
94
- file_name = pdf_obj.name
95
- knowledge_base = create_embeddings(pdf_obj)
96
- user_question = st.text_input("¡A jugar! Haz una pregunta sobre tu PDF:")
97
 
98
- if user_question:
99
- os.environ["GROQ_API_KEY"] = GROQ_API_KEY
100
- #os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
101
- docs = knowledge_base.similarity_search(user_question, 5)
102
- llm = ChatGroq(groq_api_key = os.getenv('GROQ_API_KEY'),model = modelo_llm)
103
- #llm = ChatOpenAI(model_name='gpt-3.5-turbo')
104
- chain = load_qa_chain(llm, chain_type="stuff")
105
- respuesta = chain.run(input_documents=docs, question=user_question)
 
 
106
 
107
- # Mostrar la variable en color verde
108
- st.subheader("Respuesta")
109
- st.write(f":green[{str(respuesta)}]")
 
110
 
111
- # Guardar pregunta y respuesta en los logs
112
- logs.append({"Pregunta": user_question, "Respuesta": respuesta})
 
 
113
 
114
- # Mostrar logs actualizados
115
- mostrar_logs(logs,docs)
116
 
117
- # Guarda la consulta en un archivo
118
- save_to_file()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ from datetime import datetime
3
+ from pathlib import Path
4
 
5
+ import streamlit as st
6
  from PyPDF2 import PdfReader
7
+
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from langchain_core.prompts import ChatPromptTemplate
10
+ from langchain_community.embeddings import HuggingFaceEmbeddings
11
  from langchain_community.vectorstores import FAISS
12
  from langchain_groq import ChatGroq
13
+ from langchain.chains import create_retrieval_chain
14
+ from langchain.chains.combine_documents import create_stuff_documents_chain
15
+
16
+
17
# -------------------------
# General configuration
# -------------------------
st.set_page_config(
    page_title="Lectorín",
    page_icon="📄",
    layout="wide",
)

st.title("📄 Lectorín 2026")
st.caption("Pregunta a tu PDF con RAG, FAISS y Groq")


def _read_secret(name: str) -> str:
    """Return the setting *name* from Streamlit secrets, else the environment.

    Lookup order:
      1) ``st.secrets[name]``
      2) the environment variable *name*
      3) the empty string

    Accessing ``st.secrets`` raises when no ``secrets.toml`` file exists, so a
    bare ``st.secrets.get(name, default)`` never reaches its default in that
    situation. The lookup is therefore guarded so deployments that only set
    environment variables keep working.
    """
    fallback = os.getenv(name, "")
    try:
        return st.secrets.get(name, fallback)
    except FileNotFoundError:
        # NOTE(review): older Streamlit raises FileNotFoundError here; newer
        # releases raise StreamlitSecretNotFoundError, which is presumably a
        # FileNotFoundError subclass — confirm against the pinned version.
        return fallback


# Secrets / env vars
GROQ_API_KEY = _read_secret("GROQ_API_KEY")

# Optional LangSmith tracing: enabled only when a key is configured.
LANGCHAIN_API_KEY = _read_secret("LANGCHAIN_API_KEY")
if LANGCHAIN_API_KEY:
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
    os.environ["LANGCHAIN_PROJECT"] = "qpdf-2026"

# Local data folder; the question/answer history file lives inside it.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)
HISTORIAL_PATH = DATA_DIR / "historial.txt"
46
+
47
+
48
# -------------------------
# Session state
# -------------------------
# Seed each per-session key only when it is missing, so values stored by an
# earlier run of the script are never overwritten.
_SESSION_DEFAULTS = {
    "logs": [],
    "knowledge_base": None,
    "current_pdf_name": None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
59
+
60
+
61
# -------------------------
# Models
# -------------------------
# UI label -> (Hugging Face model id, value unpacked below as `sequence`).
# NOTE(review): `sequence` is presumably the model's usable input length in
# tokens; it is only used to derive the default chunk size — confirm.
modelos_embeddings = {
    "multilingual-e5-small (rápido)": ("intfloat/multilingual-e5-small", 512),
    "multi-qa-MiniLM-L6-cos-v1 (ligero)": ("multi-qa-MiniLM-L6-cos-v1", 256),
    "bge-m3 (mejor multilingüe, más pesado)": ("BAAI/bge-m3", 2048),
}

# UI label -> Groq model id.
modelos_llm = {
    "Llama 3.3 70B Versatile": "llama-3.3-70b-versatile",
}

with st.sidebar:
    st.header("Configuración")

    embedding_label = st.selectbox("Modelo de embeddings", list(modelos_embeddings))
    embedding_model_name, sequence = modelos_embeddings[embedding_label]

    llm_label = st.selectbox("Modelo LLM", list(modelos_llm))
    llm_model_name = modelos_llm[llm_label]

    k_docs = st.slider("Chunks recuperados", min_value=2, max_value=8, value=4)

    # Default chunk size follows the selected embedding model, capped at 2000.
    default_chunk_size = min(sequence * 4, 2000)
    chunk_size = st.slider(
        "Chunk size",
        min_value=500,
        max_value=3000,
        value=default_chunk_size,
        step=100,
    )
    chunk_overlap = st.slider(
        "Chunk overlap",
        min_value=50,
        max_value=400,
        value=150,
        step=25,
    )

    st.divider()
    st.write("Para producción, configura `GROQ_API_KEY` en secretos o variables de entorno.")
88
 
 
 
 
 
 
 
 
89
 
90
+ # -------------------------
91
+ # Utilidades
92
+ # -------------------------
93
def extract_text_from_pdf(uploaded_file) -> str:
    """Return the text of every non-blank page, separated by blank lines.

    Args:
        uploaded_file: Object handed straight to ``PdfReader``.

    Returns:
        The page texts joined with ``"\\n\\n"``. Pages whose extraction yields
        nothing (``None``, empty or whitespace-only) are skipped, so the
        result is ``""`` when no page has text.
    """
    page_texts = (page.extract_text() or "" for page in PdfReader(uploaded_file).pages)
    return "\n\n".join(content for content in page_texts if content.strip())
101
 
102
 
103
@st.cache_resource(show_spinner=False)
def load_embeddings_model(model_name: str):
    """Create the ``HuggingFaceEmbeddings`` object for *model_name*.

    Wrapped in ``st.cache_resource`` (spinner disabled) so the result is
    handled by Streamlit's resource cache rather than rebuilt by this code.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return embeddings
106
 
 
 
 
 
107
 
108
@st.cache_data(show_spinner=False)
def split_text_to_chunks(text: str, chunk_size: int, chunk_overlap: int):
    """Split *text* into overlapping chunks measured in characters.

    Args:
        text: Full document text.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The list of chunk strings produced by ``RecursiveCharacterTextSplitter``.
    """
    # Separators are tried in order: paragraph, line, sentence, word, character.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", ". ", " ", ""],
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
117
+
118
 
119
def build_knowledge_base(uploaded_file, embedding_model_name: str, chunk_size: int, chunk_overlap: int):
    """Turn an uploaded PDF into a FAISS vector store.

    Args:
        uploaded_file: PDF file object passed to ``extract_text_from_pdf``.
        embedding_model_name: Model id passed to ``load_embeddings_model``.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        A ``(vectorstore, chunk_count)`` tuple.

    Raises:
        ValueError: If no text could be extracted from the PDF.
    """
    document_text = extract_text_from_pdf(uploaded_file)
    if document_text.strip() == "":
        raise ValueError("No se pudo extraer texto del PDF.")

    pieces = split_text_to_chunks(document_text, chunk_size, chunk_overlap)
    index = FAISS.from_texts(pieces, load_embeddings_model(embedding_model_name))
    return index, len(pieces)
128
 
 
 
129
 
130
def save_to_file(file_name: str, question: str, answer: str):
    """Append one question/answer record to the history file.

    The record is a header line (dashes, local timestamp to the minute, the
    PDF name in parentheses, dashes) followed by the question, the answer and
    a blank line. The file at ``HISTORIAL_PATH`` is opened in append mode as
    UTF-8.
    """
    rule = "-" * 25
    with open(HISTORIAL_PATH, "a", encoding="utf-8") as history:
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M")
        # Emit the whole record in one write; the spacing matches the header
        # layout "<rule> <stamp>  (<file>) <rule>".
        history.write(
            f"{rule} {stamp}  ({file_name}) {rule}\n"
            f"Pregunta: {question}\n"
            f"Respuesta: {answer}\n\n"
        )
139
+
140
+
141
def build_rag_chain(vectorstore, groq_api_key: str, model_name: str, k: int = 4):
    """Assemble the retrieval chain used to answer questions about the PDF.

    Args:
        vectorstore: Vector store exposing ``as_retriever``.
        groq_api_key: API key passed to ``ChatGroq``.
        model_name: Groq model id.
        k: Number of chunks the retriever is asked to return per query.

    Returns:
        The chain built by ``create_retrieval_chain``; it is invoked with a
        dict containing the ``"input"`` key used by the prompt.
    """
    # The system prompt restricts the model to the retrieved context and asks
    # for precise answers in Spanish.
    system_message = (
        "Responde usando solo el contexto recuperado. "
        "Si la respuesta no está en el documento, di claramente que no aparece en el PDF. "
        "Contesta en español y de forma precisa.\n\nContexto:\n{context}"
    )
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_message),
        ("human", "{input}"),
    ])

    llm = ChatGroq(groq_api_key=groq_api_key, model=model_name, temperature=0)
    answer_chain = create_stuff_documents_chain(llm, prompt)

    retriever = vectorstore.as_retriever(search_kwargs={"k": k})
    return create_retrieval_chain(retriever, answer_chain)
163
+
164
+
165
def render_logs():
    """Render this session's question history in the sidebar, newest first.

    Each entry of ``st.session_state.logs`` becomes an expander titled with
    its position and the first 60 characters of the question; the expander
    body shows the answer. A caption is shown when there are no entries.
    """
    with st.sidebar:
        st.subheader("Historial de preguntas")

        history = st.session_state.logs
        if not history:
            st.caption("Todavía no hay preguntas.")
            return

        for position, item in enumerate(reversed(history), start=1):
            with st.expander(f"{position}. {item['Pregunta'][:60]}"):
                st.write(item["Respuesta"])
174
+
175
+
176
# -------------------------
# Main interface
# -------------------------
pdf_obj = st.file_uploader("Carga tu documento PDF", type="pdf")

if pdf_obj is not None:
    # Identify the upload by name AND size: comparing the name alone keeps a
    # stale index when a different file with the same name is uploaded.
    pdf_signature = (pdf_obj.name, pdf_obj.size)
    if st.session_state.get("current_pdf_signature") != pdf_signature:
        st.session_state["current_pdf_signature"] = pdf_signature
        st.session_state.current_pdf_name = pdf_obj.name
        st.session_state.logs = []
        st.session_state.knowledge_base = None
        st.session_state["last_query"] = None

    col1, col2 = st.columns([1, 1])

    with col1:
        if st.button("Procesar PDF", type="primary", use_container_width=True):
            with st.spinner("Procesando PDF y creando índice vectorial..."):
                try:
                    kb, n_chunks = build_knowledge_base(
                        pdf_obj,
                        embedding_model_name,
                        chunk_size,
                        chunk_overlap,
                    )
                except Exception as e:  # UI boundary: report instead of crashing
                    st.error(f"Error procesando el PDF: {e}")
                else:
                    st.session_state.knowledge_base = kb
                    # A rebuilt index invalidates any previously cached answer.
                    st.session_state["last_query"] = None
                    st.success(f"PDF procesado correctamente. Chunks generados: {n_chunks}")

    with col2:
        if st.session_state.knowledge_base is not None:
            st.success("Base vectorial lista.")
        else:
            st.info("Sube un PDF y pulsa 'Procesar PDF'.")

    if not GROQ_API_KEY:
        st.warning("Falta GROQ_API_KEY. Añádela en Streamlit secrets o en variables de entorno.")
    elif st.session_state.knowledge_base is not None:
        user_question = st.text_input("Haz una pregunta sobre tu PDF")

        if user_question:
            # Streamlit re-executes the script on every widget interaction
            # while the question text stays in the input. Without this guard
            # each rerun would call the LLM again, append a duplicate history
            # entry and re-write the history file. The last answer is kept in
            # session state and reused until the query actually changes.
            query_key = (pdf_signature, user_question, llm_model_name, k_docs)
            shown = st.session_state.get("last_query")
            if shown is None or shown["key"] != query_key:
                shown = None
                with st.spinner("Consultando el documento..."):
                    try:
                        rag_chain = build_rag_chain(
                            st.session_state.knowledge_base,
                            GROQ_API_KEY,
                            llm_model_name,
                            k=k_docs,
                        )
                        result = rag_chain.invoke({"input": user_question})
                        shown = {
                            "key": query_key,
                            "answer": result["answer"],
                            "context": result.get("context", []),
                        }
                    except Exception as e:  # UI boundary: report instead of crashing
                        st.error(f"Error al consultar el PDF: {e}")

                if shown is not None:
                    st.session_state["last_query"] = shown
                    st.session_state.logs.append({
                        "Pregunta": user_question,
                        "Respuesta": shown["answer"],
                    })
                    try:
                        save_to_file(pdf_obj.name, user_question, shown["answer"])
                    except OSError as e:
                        # Persisting history is best-effort; the answer is
                        # still displayed below.
                        st.warning(f"No se pudo guardar el historial: {e}")

            if shown is not None:
                st.subheader("Respuesta")
                st.write(shown["answer"])

                with st.expander("Ver fragmentos recuperados"):
                    if shown["context"]:
                        for i, doc in enumerate(shown["context"], start=1):
                            st.markdown(f"**Chunk {i}**")
                            st.write(doc.page_content)
                            st.markdown("---")
                    else:
                        st.caption("No se devolvieron fragmentos.")

render_logs()