File size: 11,821 Bytes
980f81a
4a86350
4ee4e2e
7f73e5e
 
 
 
aeb1724
55153f6
 
 
55a7931
 
cce1726
2e80bf6
4a86350
70601ba
2d5c108
70601ba
2b15de9
55153f6
1c8cf6c
 
 
55153f6
1c8cf6c
 
55a7931
 
1ccf917
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55153f6
 
3bc496b
b28944c
e884da5
 
 
 
 
 
 
 
c51a9b2
e884da5
 
 
dbf30b9
b28944c
e884da5
b28944c
2bda19d
b28944c
 
2bda19d
c51a9b2
2bda19d
78b225d
2bda19d
 
 
 
 
 
c0baed4
c51a9b2
2bda19d
c51a9b2
b28944c
87575d2
 
3bc496b
87575d2
 
 
 
 
 
 
f0f639a
 
aeb1724
 
f0f639a
aeb1724
f0f639a
db0e4d2
aeb1724
db0e4d2
aeb1724
 
db0e4d2
f0f639a
aeb1724
 
db0e4d2
aeb1724
 
f0f639a
db0e4d2
 
aeb1724
f0f639a
db0e4d2
aeb1724
db0e4d2
aeb1724
db0e4d2
 
aeb1724
db0e4d2
 
 
 
aeb1724
 
 
db0e4d2
 
 
 
aeb1724
 
 
 
db0e4d2
 
 
 
 
aeb1724
 
 
87575d2
9fc2b4c
 
4765456
9fc2b4c
 
 
 
 
 
 
 
 
 
 
 
 
 
4765456
2b15de9
89082d2
9fc2b4c
df8dd91
4765456
9fc2b4c
 
 
 
2b15de9
 
 
4765456
2b15de9
 
 
 
 
 
 
9fc2b4c
 
 
2c2b401
08b169e
 
 
 
 
9fc2b4c
 
 
08b169e
2c2b401
9fc2b4c
1254511
 
 
7e25ccc
1254511
 
3e88fba
1254511
 
7e25ccc
1254511
7e25ccc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1254511
2c2b401
 
 
c0baed4
87575d2
2eb9acb
 
dbf30b9
2eb9acb
 
e884da5
3bc496b
e884da5
 
3bc496b
 
 
e884da5
c0baed4
 
 
2eb9acb
48d30ee
2eb9acb
48d30ee
 
c0baed4
48d30ee
 
 
2eb9acb
c0baed4
 
b963f56
c0baed4
48d30ee
2eb9acb
c0baed4
2eb9acb
b963f56
 
 
 
 
 
 
 
c0baed4
dbf30b9
c0baed4
dbf30b9
3bc496b
dbf30b9
 
 
 
 
 
 
 
 
b963f56
 
dbf30b9
c0baed4
b963f56
2eb9acb
c0baed4
b963f56
 
 
 
48d30ee
b963f56
dbf30b9
 
 
b963f56
dbf30b9
 
48d30ee
c0baed4
dbf30b9
48d30ee
 
 
2eb9acb
c0baed4
dbf30b9
b28944c
8370383
 
70601ba
2d5c108
 
 
 
 
 
 
 
 
 
 
 
7047942
2d5c108
 
 
 
 
7047942
 
2d5c108
 
 
7047942
2d5c108
70601ba
2d5c108
 
 
 
 
 
 
 
 
 
 
 
 
 
7047942
2d5c108
 
7047942
 
2d5c108
 
 
 
 
 
 
70601ba
 
19dc6e0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
from mcp.server.fastmcp import FastMCP, Context
from datetime import datetime
from llama_index.core import VectorStoreIndex
from llama_index.core import (
    StorageContext,
    load_index_from_storage,
)
from llama_index.tools.arxiv import ArxivToolSpec
from llama_index.core import Settings
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from typing import Optional, List, Dict, Any
from pathlib import Path
import json
import os
import aiohttp  # Necesario para las peticiones HTTP asíncronas

import asyncio

import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)



##### OPENAI #####
# Context:
# API key is read from the environment; None if unset, in which case the
# OpenAI clients below will fail at first request time.
openai_api_key = os.environ.get('OPENAI_API_KEY')


# Chat model used by LlamaIndex for synthesis/query steps.
llm = OpenAI(
    model="gpt-4.1",
    api_key=openai_api_key,
)
# Embedding model; presumably this must match the model the persisted
# indices were built with — TODO confirm against the index build pipeline.
embed_model = OpenAIEmbedding(
    model="text-embedding-ada-002",
    api_key=openai_api_key,
)

# Register both as the global LlamaIndex defaults.
Settings.llm = llm
Settings.embed_model = embed_model




# Path configuration
DOCUMENTS_BASE_PATH = "./"  # root directory containing one folder per source
RETRIEVERS_JSON_PATH = Path("./retrievers.json")  # catalogue of retriever metadata

# Load the retriever metadata catalogue
def load_retrievers_metadata():
    """Read and parse RETRIEVERS_JSON_PATH.

    Returns:
        dict: The parsed JSON mapping, or an empty dict when the file is
        missing or unparseable (best-effort: the error is logged, not raised).
    """
    try:
        raw = RETRIEVERS_JSON_PATH.read_text(encoding='utf-8')
        return json.loads(raw)
    except Exception as exc:
        logger.error(f"Error cargando retrievers.json: {str(exc)}", exc_info=True)
        return {}

retrievers_metadata = load_retrievers_metadata()
# Map each source name to its (lower-cased) subdirectory, e.g. "Docs" -> "docs/".
SOURCES = {source: f"{source.lower()}/" for source in retrievers_metadata.keys()}

# Load persisted vector indices: walk every source directory looking for
# "storage_nodes" folders and load one index per containing directory.
indices: Dict[str, VectorStoreIndex] = {}

for source, rel_path in SOURCES.items():
    full_path = os.path.join(DOCUMENTS_BASE_PATH, rel_path)
    if not os.path.exists(full_path):
        logger.warning(f"No se encontró la ruta para {source}")
        continue

    for root, dirs, files in os.walk(full_path):
        if "storage_nodes" in dirs:
            # Bind index_name BEFORE the try block: previously it was assigned
            # after StorageContext.from_defaults(), so a failure there made the
            # except handler raise NameError and mask the real loading error.
            index_name = os.path.basename(root)
            try:
                storage_path = os.path.join(root, "storage_nodes")
                storage_context = StorageContext.from_defaults(persist_dir=storage_path)
                indices[index_name] = load_index_from_storage(storage_context)  # , index_id="vector_index"
                logger.info(f"Índice cargado correctamente: {index_name}")
            except Exception as e:
                logger.error(f"Error cargando índice {index_name}: {str(e)}", exc_info=True)




            


# HTTP port for the MCP server (7860 — presumably the Hugging Face Spaces
# default; confirm against the deployment target).
port = int(os.getenv("PORT", 7860))
mcp = FastMCP("OnBase", port=port)


# Build the ArXiv search tool once at import time; return_direct means the
# tool output is returned to the client without further LLM post-processing.
arxiv_tool = ArxivToolSpec(max_results=5).to_tool_list()[0]
arxiv_tool.return_direct = True

@mcp.tool()
async def search_arxiv(
    query: str,
    max_results: int = 5
) -> Dict[str, Any]:
    """
    Search for academic papers on ArXiv.

    Args:
        query: Search terms (e.g. "deep learning")
        max_results: Maximum number of results (clamped to 1-10, default 5)

    Returns:
        Dict: {'papers': [...], 'count': int, 'query': str, 'status': 'success'}
        on success; on any failure, {'papers': [], 'count': 0, 'query': str,
        'status': 'error', 'error': str}.
    """
    try:
        # Clamp the requested result count to the supported 1-10 range.
        max_results = min(max(1, max_results), 10)
        # NOTE(review): mutates shared tool metadata after construction — the
        # spec was built with max_results=5; verify ArxivToolSpec actually
        # honors this attribute at call time.
        arxiv_tool.metadata.max_results = max_results
        
        # Run the search; raw_output is expected to hold one Document per paper.
        tool_output = arxiv_tool(query=query)
        
        # Parse each document's plain text. Assumes the first line looks like
        # "<pdf-url>: <title>" and the remaining lines are the abstract —
        # TODO confirm against the ArxivToolSpec output format.
        papers = []
        for doc in tool_output.raw_output:
            content = doc.text_resource.text.split('\n')
            papers.append({
                'title': content[0].split(': ')[1] if ': ' in content[0] else content[0],
                'abstract': '\n'.join(content[1:]).strip(),
                'pdf_url': content[0].split(': ')[0].replace('http://', 'https://'),
                'arxiv_id': content[0].split(': ')[0].split('/')[-1].replace('v1', '')
            })
        
        return {
            'papers': papers,
            'count': len(papers),
            'query': query,
            'status': 'success'
        }
        
    except Exception as e:
        # Best-effort: surface the error in the payload instead of raising.
        return {
            'papers': [],
            'count': 0,
            'query': query,
            'status': 'error',
            'error': str(e)
        }



@mcp.tool()
async def list_retrievers(source: Optional[str] = None) -> dict:
    """
    Return the list of available retrievers.
    If a source is given and exists, filter by it; if it does not exist,
    the filter is silently ignored and everything is returned.

    Args:
        source (str, optional): Source to filter by. Unknown values are
            ignored. Defaults to None. (Annotation fixed to Optional[str] —
            implicit Optional via ``source: str = None`` is disallowed by
            PEP 484.)

    Returns:
        dict: {
            "retrievers": list of retrievers (filtered or complete),
            "count": total number,
            "status": "success"|"error",
            "source_requested": the value that was requested,
            "source_used": "all" or the source actually applied
        }
    """
    try:
        available = []
        # Treat the filter as active only when the source actually exists.
        source_exists = source in retrievers_metadata if source else False
        
        for current_source, indexes in retrievers_metadata.items():
            # Only filter when the requested source exists; otherwise show all.
            if source_exists and current_source != source:
                continue
                
            for index_name, metadata in indexes.items():
                available.append({
                    "name": index_name,
                    "source": current_source,
                    "title": metadata.get("title", ""),
                    "description": metadata.get("description", "")
                })
        
        return {
            "retrievers": available,
            "count": len(available),
            "status": "success",
            "source_requested": source,
            "source_used": source if source_exists else "all"
        }
    except Exception as e:
        # Defensive: report rather than raise so the MCP client gets a payload.
        return {
            "retrievers": [],
            "count": 0,
            "status": "error",
            "error": str(e),
            "source_requested": source,
            "source_used": "none"
        }


@mcp.tool()
async def list_retrievers_from_resource(ctx: Context, source: Optional[str] = None) -> dict:
    """
    Return the list of available retrievers, optionally filtered by source,
    reading the catalogue from the "data://retrievers/list" MCP resource.

    NOTE(review): this tool was originally also named ``list_retrievers``,
    which duplicated — and therefore shadowed/conflicted with — the tool of
    the same name defined above. Renamed so both registrations can coexist.
    The resource it reads is not defined in this file; confirm it is
    registered elsewhere and returns the dict shape used below
    ({"retrievers": [...], "count": int, "status": "success"|"error"}).

    Args:
        ctx: FastMCP request context (injected by the framework).
        source (str, optional): Source to filter by. Default None.

    Returns:
        dict: Retriever list with metadata, plus source_requested/source_used.
    """
    # FastMCP's Context exposes read_resource(); ``ctx.resources.read`` does
    # not exist and raised AttributeError at call time.
    # NOTE(review): assumes the resource handler yields the dict described
    # above — verify the actual return shape of read_resource here.
    result = await ctx.read_resource("data://retrievers/list")
    
    # Propagate resource-level errors unchanged.
    if result.get("status") == "error":
        return result
    
    # Apply the optional source filter client-side.
    if source:
        filtered = [r for r in result["retrievers"] if r["source"] == source]
        return {
            "retrievers": filtered,
            "count": len(filtered),
            "status": "success",
            "source_requested": source,
            "source_used": source if filtered else "none"
        }
    
    return {
        **result,
        "source_requested": None,
        "source_used": "all"
    }




# Retriever search tool and its catalogue-lookup helper
def _find_retriever_metadata(name: str):
    """Return (metadata, source) for *name* from retrievers_metadata,
    or ({}, "unknown") when the retriever is not catalogued."""
    for src, indexes in retrievers_metadata.items():
        if name in indexes:
            return indexes[name], src
    return {}, "unknown"


@mcp.tool()
def retrieve_docs(
    query: str,
    retrievers: List[str],
    top_k: int = 3
) -> dict:
    """
    Run a semantic search over the indexed documents.

    Args:
        query (str): Search text (required).
        retrievers (List[str]): Names of the retrievers to query (required).
        top_k (int): Number of results per retriever (optional, default=3).

    Returns:
        dict: {"query", "results", "top_k"}; "results" maps each retriever
        name to either its matched documents (content, metadata, score) plus
        catalogue metadata, or an {"error", "retriever"} entry. A "warnings"
        key is added when unknown retriever names were requested.
    """
    # Lazy %-style logging args throughout: messages are only formatted when
    # the level is enabled (the original eagerly built f-strings, including
    # on debug calls).
    logger.info("Iniciando búsqueda para query: '%s'", query)
    logger.debug("Parámetros - retrievers: %s, top_k: %s", retrievers, top_k)

    results = {}
    invalid = []

    for name in retrievers:
        if name not in indices:
            logger.warning("Retriever no encontrado: %s", name)
            invalid.append(name)
            continue

        try:
            logger.info("Procesando retriever: %s", name)

            # 1. Build a retriever over the index and fetch the top_k nodes.
            logger.debug("Creando retriever para %s con top_k=%s", name, top_k)
            retriever = indices[name].as_retriever(similarity_top_k=top_k)
            nodes = retriever.retrieve(query)
            logger.info("Retrieved %s documentos de %s", len(nodes), name)

            # 2. Look up the FULL catalogue metadata (and owning source).
            metadata, source = _find_retriever_metadata(name)
            logger.debug("Metadatos encontrados para %s: %s", name, metadata.keys())

            # 3. Assemble this retriever's response entry.
            results[name] = {
                "title": metadata.get("title", name),
                "documents": [
                    {
                        "content": node.get_content(),
                        "metadata": node.metadata,
                        "score": node.score
                    }
                    for node in nodes
                ],
                "description": metadata.get("description", ""),
                "source": source,
                "last_updated": metadata.get("last_updated", "")
            }
            logger.info("Retriever %s procesado exitosamente", name)

        except Exception as e:
            logger.error("Error procesando retriever %s: %s", name, str(e), exc_info=True)
            results[name] = {
                "error": str(e),
                "retriever": name
            }

    # Final response envelope.
    response = {
        "query": query,
        "results": results,
        "top_k": top_k,
    }

    if invalid:
        logger.warning("Retrievers inválidos: %s. Opciones válidas: %s", invalid, list(indices.keys()))
        response["warnings"] = {
            "invalid_retrievers": invalid,
            "valid_options": list(indices.keys())
        }

    logger.info("Búsqueda completada. Total resultados: %s", len(results))
    return response

    

@mcp.tool()
async def search_tavily(
    query: str,
    days: int = 7,
    max_results: int = 1,
    include_answer: bool = False
) -> dict:
    """Perform a web search using the Tavily API.

    Args:
        query: Search query string (required)
        days: Restrict search to last N days (default: 7)
        max_results: Maximum results to return (default: 1)
        include_answer: Include a direct answer only when requested by the user (default: False)

    Returns:
        dict: Search results from Tavily, or an error payload on failure
    """
    # The API key comes from the environment; fail fast when it is missing.
    api_key = os.environ.get('TAVILY_API_KEY')
    if not api_key:
        raise ValueError("TAVILY_API_KEY environment variable not set")

    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    request_body = {
        "query": query,
        "search_depth": "basic",
        "max_results": max_results,
        "days": days if days else None,
        "include_answer": include_answer
    }

    try:
        async with aiohttp.ClientSession() as session:
            response = await session.post(
                "https://api.tavily.com/search",
                headers=request_headers,
                json=request_body
            )
            async with response:
                response.raise_for_status()
                return await response.json()

    except Exception as exc:
        # Best-effort: report the failure in the payload instead of raising.
        return {
            "error": str(exc),
            "status": "failed",
            "query": query
        }

if __name__ == "__main__":
    # Serve the MCP server over Server-Sent Events on the configured port.
    mcp.run("sse")