geronimo-pericoli committed on
Commit
97520b6
·
verified ·
1 Parent(s): db47b33

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -72
app.py CHANGED
@@ -46,26 +46,26 @@ Settings.embed_model = embed_model
46
  DOCUMENTS_BASE_PATH = "./"
47
  RETRIEVERS_JSON_PATH = Path("./retrievers.json")
48
 
49
- # Cargar metadatos
50
  def load_retrievers_metadata():
51
  try:
52
  with open(RETRIEVERS_JSON_PATH, 'r', encoding='utf-8') as f:
53
  return json.load(f)
54
  except Exception as e:
55
- print(f"Error cargando retrievers.json: {str(e)}")
56
- print(f"Detalles del error: {traceback.format_exc()}") # Necesitarías importar traceback
57
  return {}
58
 
59
  retrievers_metadata = load_retrievers_metadata()
60
  SOURCES = {source: f"{source.lower()}/" for source in retrievers_metadata.keys()}
61
 
62
- # Cargar índices
63
  indices: Dict[str, VectorStoreIndex] = {}
64
 
65
  for source, rel_path in SOURCES.items():
66
  full_path = os.path.join(DOCUMENTS_BASE_PATH, rel_path)
67
  if not os.path.exists(full_path):
68
- print(f"Advertencia: No se encontró la ruta para {source}")
69
  continue
70
 
71
  for root, dirs, files in os.walk(full_path):
@@ -75,10 +75,10 @@ for source, rel_path in SOURCES.items():
75
  storage_context = StorageContext.from_defaults(persist_dir=storage_path)
76
  index_name = os.path.basename(root)
77
  indices[index_name] = load_index_from_storage(storage_context) #, index_id="vector_index"
78
- print(f"Índice cargado correctamente: {index_name}")
79
  except Exception as e:
80
- print(f"Error cargando índice {index_name}: {str(e)}")
81
- print(f"Detalles del error: {traceback.format_exc()}")
82
 
83
 
84
 
@@ -98,26 +98,26 @@ async def search_arxiv(
98
  max_results: int = 5
99
  ) -> Dict[str, Any]:
100
  """
101
- Busca artículos académicos en ArXiv.
102
 
103
  Args:
104
- query: Términos de búsqueda (ej. "deep learning")
105
- max_results: Número máximo de resultados (1-10, default 5)
106
 
107
  Returns:
108
- Dict: Resultados de la búsqueda con metadatos de los papers
109
  """
110
  try:
111
- # Configurar máximo de resultados
112
  max_results = min(max(1, max_results), 10)
113
  arxiv_tool.metadata.max_results = max_results
114
 
115
- # Ejecutar búsqueda y obtener resultados
116
  tool_output = arxiv_tool(query=query)
117
 
118
- # Procesar documentos
119
  papers = []
120
- for doc in tool_output.raw_output: # Acceder correctamente a los documentos
121
  content = doc.text_resource.text.split('\n')
122
  papers.append({
123
  'title': content[0].split(': ')[1] if ': ' in content[0] else content[0],
@@ -144,19 +144,19 @@ async def search_arxiv(
144
 
145
  async def list_retrievers(source: str = None) -> dict:
146
  """
147
- Devuelve la lista de retrievers disponibles.
148
- Si se especifica una source y existe, filtra por ella; si no existe, devuelve todas.
149
 
150
  Args:
151
- source (str, optional): Fuente para filtrar. Si no existe, se ignorará. Defaults to None.
152
 
153
  Returns:
154
  dict: {
155
- "retrievers": Lista de retrievers (filtrados o completos),
156
- "count": Número total,
157
  "status": "success"|"error",
158
- "source_requested": source, # Muestra lo que se solicitó
159
- "source_used": "all"|source # Muestra lo que realmente se usó
160
  }
161
  """
162
  try:
@@ -164,7 +164,7 @@ async def list_retrievers(source: str = None) -> dict:
164
  source_exists = source in retrievers_metadata if source else False
165
 
166
  for current_source, indexes in retrievers_metadata.items():
167
- # Solo filtrar si el source existe, sino mostrar todo
168
  if source_exists and current_source != source:
169
  continue
170
 
@@ -200,32 +200,32 @@ def retrieve_docs(
200
  top_k: int = 3
201
  ) -> dict:
202
  """
203
- Realiza búsqueda semántica en documentos indexados.
204
 
205
- Parámetros:
206
- query (str): Texto de búsqueda (requerido)
207
- retrievers (List[str]): Nombres de retrievers a consultar (requerido)
208
- top_k (int): Número de resultados por retriever (opcional, default=3)
209
  """
210
- print(f"Iniciando búsqueda para query: '{query}'")
211
- print(f"Parámetros - retrievers: {retrievers}, top_k: {top_k}")
212
 
213
  results = {}
214
  invalid = []
215
 
216
  for name in retrievers:
217
  if name not in indices:
218
- print(f"Retriever no encontrado: {name}")
219
  invalid.append(name)
220
  continue
221
 
222
  try:
223
- print(f"Procesando retriever: {name}")
224
  retriever = indices[name].as_retriever(similarity_top_k=top_k)
225
  nodes = retriever.retrieve(query)
226
- print(f"Retrieved {len(nodes)} documentos de {name}")
227
 
228
- # 2. Buscar metadatos COMPLETOS
229
  metadata = {}
230
  source = "unknown"
231
  for src, indexes in retrievers_metadata.items():
@@ -233,9 +233,9 @@ def retrieve_docs(
233
  metadata = indexes[name]
234
  source = src
235
  break
236
- print(f"Metadatos encontrados para {name}: {metadata.keys()}")
237
 
238
- # 3. Construir respuesta
239
  results[name] = {
240
  "title": metadata.get("title", name),
241
  "documents": [
@@ -250,16 +250,16 @@ def retrieve_docs(
250
  "source": source,
251
  "last_updated": metadata.get("last_updated", "")
252
  }
253
- print(f"Retriever {name} procesado exitosamente")
254
 
255
  except Exception as e:
256
- print(f"Error procesando retriever {name}: {str(e)}", exc_info=True)
257
  results[name] = {
258
  "error": str(e),
259
  "retriever": name
260
  }
261
 
262
- # Construir respuesta final
263
  response = {
264
  "query": query,
265
  "results": results,
@@ -267,13 +267,13 @@ def retrieve_docs(
267
  }
268
 
269
  if invalid:
270
- print(f"Retrievers inválidos: {invalid}. Opciones válidas: {list(indices.keys())}")
271
  response["warnings"] = {
272
  "invalid_retrievers": invalid,
273
  "valid_options": list(indices.keys())
274
  }
275
 
276
- print(f"Búsqueda completada. Total resultados: {len(results)}")
277
  return response
278
 
279
 
@@ -294,7 +294,7 @@ async def search_tavily(
294
  Returns:
295
  dict: Search results from Tavily
296
  """
297
- # Obtener la API key de las variables de entorno
298
  tavily_api_key = os.environ.get('TAVILY_API_KEY')
299
  if not tavily_api_key:
300
  raise ValueError("TAVILY_API_KEY environment variable not set")
@@ -340,66 +340,66 @@ async def search_tavily(
340
 
341
 
342
  # Gradio interface
343
- with gr.Blocks(title="Herramientas MCP", theme=gr.themes.Base()) as arxiv_tab:
344
  arxiv_interface = gr.Interface(
345
  fn=search_arxiv,
346
  inputs=[
347
- gr.Textbox(label="Términos de búsqueda", placeholder="Ej: deep learning"),
348
- gr.Slider(1, 10, value=5, step=1, label="Número máximo de resultados")
349
  ],
350
- outputs=gr.JSON(label="Resultados de búsqueda"),
351
- title="Búsqueda en ArXiv",
352
- description="Busca artículos académicos en ArXiv por palabras clave.",
353
  api_name="_search_arxiv"
354
  )
355
 
356
- with gr.Blocks(title="Herramientas MCP", theme=gr.themes.Base()) as list_retrievers_tab:
357
  retrievers_interface = gr.Interface(
358
  fn=list_retrievers,
359
- inputs=gr.Textbox(label="Fuente (opcional)", placeholder="Dejar vacío para listar todos"),
360
- outputs=gr.JSON(label="Lista de retrievers"),
361
- title="Lista de Retrievers",
362
- description="Muestra los retrievers disponibles, opcionalmente filtrados por fuente.",
363
  api_name="_list_retrievers"
364
  )
365
 
366
- with gr.Blocks(title="Herramientas MCP", theme=gr.themes.Base()) as tavily_tab:
367
  tavily_interface = gr.Interface(
368
  fn=search_tavily,
369
  inputs=[
370
- gr.Textbox(label="Consulta de búsqueda", placeholder="Ej: últimas noticias sobre IA"),
371
- gr.Slider(1, 30, value=7, step=1, label="Últimos N días (0 para sin límite)"),
372
- gr.Slider(1, 10, value=1, step=1, label="Máximo de resultados"),
373
- gr.Checkbox(label="Incluir respuesta directa", value=False)
374
  ],
375
- outputs=gr.JSON(label="Resultados de Tavily"),
376
- title="Búsqueda Web (Tavily)",
377
- description="Realiza búsquedas en web usando la API de Tavily.",
378
  api_name="_search_tavily"
379
  )
380
 
381
- with gr.Blocks(title="Herramientas MCP", theme=gr.themes.Base()) as retrieve_tab:
382
- # Interfaz para retrieve_docs
383
  retrieve_interface = gr.Interface(
384
  fn=retrieve_docs,
385
  inputs=[
386
- gr.Textbox(label="Consulta", placeholder="Ingrese su pregunta o términos de búsqueda..."),
387
  gr.Dropdown(
388
  choices=list(indices.keys()),
389
  label="Retrievers",
390
  multiselect=True,
391
- info="Seleccione uno o más retrievers"
392
  ),
393
- gr.Slider(1, 10, value=3, step=1, label="Número de resultados por retriever (top_k)")
394
  ],
395
- outputs=gr.JSON(label="Resultados de búsqueda semántica"),
396
- title="Búsqueda Semántica en Documentos",
397
- description="""Realiza búsqueda semántica en documentos indexados usando retrievers.
398
- Seleccione los retrievers disponibles y ajuste el número de resultados.""",
399
  api_name="_retrieve"
400
  )
401
 
402
- # Creamos la interfaz con las pestañas separadas
403
  demo = gr.TabbedInterface(
404
  [arxiv_tab, tavily_tab, list_retrievers_tab, retrieve_tab],
405
  ["ArXiv", "Tavily", "List Retrievers", "Retrieve"]
 
46
  DOCUMENTS_BASE_PATH = "./"
47
  RETRIEVERS_JSON_PATH = Path("./retrievers.json")
48
 
49
+ # Load metadata
50
  def load_retrievers_metadata():
51
  try:
52
  with open(RETRIEVERS_JSON_PATH, 'r', encoding='utf-8') as f:
53
  return json.load(f)
54
  except Exception as e:
55
+ print(f"Error loading retrievers.json: {str(e)}")
56
+ print(f"Error details: {traceback.format_exc()}") # NOTE: requires `import traceback` at the top of the file
57
  return {}
58
 
59
  retrievers_metadata = load_retrievers_metadata()
60
  SOURCES = {source: f"{source.lower()}/" for source in retrievers_metadata.keys()}
61
 
62
+ # Load indexes
63
  indices: Dict[str, VectorStoreIndex] = {}
64
 
65
  for source, rel_path in SOURCES.items():
66
  full_path = os.path.join(DOCUMENTS_BASE_PATH, rel_path)
67
  if not os.path.exists(full_path):
68
+ print(f"Warning: Path not found for {source}")
69
  continue
70
 
71
  for root, dirs, files in os.walk(full_path):
 
75
  storage_context = StorageContext.from_defaults(persist_dir=storage_path)
76
  index_name = os.path.basename(root)
77
  indices[index_name] = load_index_from_storage(storage_context) #, index_id="vector_index"
78
+ print(f"Index loaded successfully: {index_name}")
79
  except Exception as e:
80
+ print(f"Error loading index {index_name}: {str(e)}")
81
+ print(f"Error details: {traceback.format_exc()}")
82
 
83
 
84
 
 
98
  max_results: int = 5
99
  ) -> Dict[str, Any]:
100
  """
101
+ Searches for academic papers on ArXiv.
102
 
103
  Args:
104
+ query: Search terms (e.g. "deep learning")
105
+ max_results: Maximum number of results (1-10, default 5)
106
 
107
  Returns:
108
+ Dict: Search results with paper metadata
109
  """
110
  try:
111
+ # Configure maximum results
112
  max_results = min(max(1, max_results), 10)
113
  arxiv_tool.metadata.max_results = max_results
114
 
115
+ # Execute search and get results
116
  tool_output = arxiv_tool(query=query)
117
 
118
+ # Process documents
119
  papers = []
120
+ for doc in tool_output.raw_output: # Correctly access documents
121
  content = doc.text_resource.text.split('\n')
122
  papers.append({
123
  'title': content[0].split(': ')[1] if ': ' in content[0] else content[0],
 
144
 
145
  async def list_retrievers(source: str = None) -> dict:
146
  """
147
+ Returns the list of available retrievers.
148
+ If a source is specified and exists, filters by it; if it doesn't exist, returns all.
149
 
150
  Args:
151
+ source (str, optional): Source to filter by. If it doesn't exist, it will be ignored. Defaults to None.
152
 
153
  Returns:
154
  dict: {
155
+ "retrievers": List of retrievers (filtered or complete),
156
+ "count": Total count,
157
  "status": "success"|"error",
158
+ "source_requested": source, # Shows what was requested
159
+ "source_used": "all"|source # Shows what was actually used
160
  }
161
  """
162
  try:
 
164
  source_exists = source in retrievers_metadata if source else False
165
 
166
  for current_source, indexes in retrievers_metadata.items():
167
+ # Only filter if source exists, otherwise show all
168
  if source_exists and current_source != source:
169
  continue
170
 
 
200
  top_k: int = 3
201
  ) -> dict:
202
  """
203
+ Performs semantic search on indexed documents.
204
 
205
+ Parameters:
206
+ query (str): Search text (required)
207
+ retrievers (List[str]): Names of retrievers to query (required)
208
+ top_k (int): Number of results per retriever (optional, default=3)
209
  """
210
+ print(f"Starting search for query: '{query}'")
211
+ print(f"Parameters - retrievers: {retrievers}, top_k: {top_k}")
212
 
213
  results = {}
214
  invalid = []
215
 
216
  for name in retrievers:
217
  if name not in indices:
218
+ print(f"Retriever not found: {name}")
219
  invalid.append(name)
220
  continue
221
 
222
  try:
223
+ print(f"Processing retriever: {name}")
224
  retriever = indices[name].as_retriever(similarity_top_k=top_k)
225
  nodes = retriever.retrieve(query)
226
+ print(f"Retrieved {len(nodes)} documents from {name}")
227
 
228
+ # 2. Search for COMPLETE metadata
229
  metadata = {}
230
  source = "unknown"
231
  for src, indexes in retrievers_metadata.items():
 
233
  metadata = indexes[name]
234
  source = src
235
  break
236
+ print(f"Metadata found for {name}: {metadata.keys()}")
237
 
238
+ # 3. Build response
239
  results[name] = {
240
  "title": metadata.get("title", name),
241
  "documents": [
 
250
  "source": source,
251
  "last_updated": metadata.get("last_updated", "")
252
  }
253
+ print(f"Retriever {name} processed successfully")
254
 
255
  except Exception as e:
256
+ print(f"Error processing retriever {name}: {str(e)}", exc_info=True)
257
  results[name] = {
258
  "error": str(e),
259
  "retriever": name
260
  }
261
 
262
+ # Build final response
263
  response = {
264
  "query": query,
265
  "results": results,
 
267
  }
268
 
269
  if invalid:
270
+ print(f"Invalid retrievers: {invalid}. Valid options: {list(indices.keys())}")
271
  response["warnings"] = {
272
  "invalid_retrievers": invalid,
273
  "valid_options": list(indices.keys())
274
  }
275
 
276
+ print(f"Search completed. Total results: {len(results)}")
277
  return response
278
 
279
 
 
294
  Returns:
295
  dict: Search results from Tavily
296
  """
297
+ # Get API key from environment variables
298
  tavily_api_key = os.environ.get('TAVILY_API_KEY')
299
  if not tavily_api_key:
300
  raise ValueError("TAVILY_API_KEY environment variable not set")
 
340
 
341
 
342
  # Gradio interface
343
+ with gr.Blocks(title="MCP Tools", theme=gr.themes.Base()) as arxiv_tab:
344
  arxiv_interface = gr.Interface(
345
  fn=search_arxiv,
346
  inputs=[
347
+ gr.Textbox(label="Search terms", placeholder="E.g.: deep learning"),
348
+ gr.Slider(1, 10, value=5, step=1, label="Maximum number of results")
349
  ],
350
+ outputs=gr.JSON(label="Search results"),
351
+ title="ArXiv Search",
352
+ description="Search for academic papers on ArXiv using keywords.",
353
  api_name="_search_arxiv"
354
  )
355
 
356
+ with gr.Blocks(title="MCP Tools", theme=gr.themes.Base()) as list_retrievers_tab:
357
  retrievers_interface = gr.Interface(
358
  fn=list_retrievers,
359
+ inputs=gr.Textbox(label="Source (optional)", placeholder="Leave empty to list all"),
360
+ outputs=gr.JSON(label="List of retrievers"),
361
+ title="List of Retrievers",
362
+ description="Shows available retrievers, optionally filtered by source.",
363
  api_name="_list_retrievers"
364
  )
365
 
366
+ with gr.Blocks(title="MCP Tools", theme=gr.themes.Base()) as tavily_tab:
367
  tavily_interface = gr.Interface(
368
  fn=search_tavily,
369
  inputs=[
370
+ gr.Textbox(label="Search query", placeholder="E.g.: latest news about AI"),
371
+ gr.Slider(1, 30, value=7, step=1, label="Last N days (0 for no limit)"),
372
+ gr.Slider(1, 10, value=1, step=1, label="Maximum results"),
373
+ gr.Checkbox(label="Include direct answer", value=False)
374
  ],
375
+ outputs=gr.JSON(label="Tavily results"),
376
+ title="Web Search (Tavily)",
377
+ description="Perform web searches using the Tavily API.",
378
  api_name="_search_tavily"
379
  )
380
 
381
+ with gr.Blocks(title="MCP Tools", theme=gr.themes.Base()) as retrieve_tab:
382
+ # Interface for retrieve_docs
383
  retrieve_interface = gr.Interface(
384
  fn=retrieve_docs,
385
  inputs=[
386
+ gr.Textbox(label="Query", placeholder="Enter your question or search terms..."),
387
  gr.Dropdown(
388
  choices=list(indices.keys()),
389
  label="Retrievers",
390
  multiselect=True,
391
+ info="Select one or more retrievers"
392
  ),
393
+ gr.Slider(1, 10, value=3, step=1, label="Number of results per retriever (top_k)")
394
  ],
395
+ outputs=gr.JSON(label="Semantic search results"),
396
+ title="Semantic Document Search",
397
+ description="""Perform semantic search on indexed documents using retrievers.
398
+ Select available retrievers and adjust the number of results.""",
399
  api_name="_retrieve"
400
  )
401
 
402
+ # Create the interface with separate tabs
403
  demo = gr.TabbedInterface(
404
  [arxiv_tab, tavily_tab, list_retrievers_tab, retrieve_tab],
405
  ["ArXiv", "Tavily", "List Retrievers", "Retrieve"]