# Document retriever tool for GAIA text-heavy task files.
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from sentence_transformers import SentenceTransformer | |
| import numpy as np | |
| import pandas as pd | |
| import PyPDF2 | |
| import os | |
| from typing import List, Dict | |
class DocumentRetrieverTool:
    """Semantic-search retrieval over GAIA text-heavy files (CSV, TXT, PDF).

    Loads a locally cached GAIA file (``temp_<task_id>.<file_type>``), splits
    its text into overlapping chunks, embeds chunks and query with a
    SentenceTransformer model, and returns the top-3 chunks ranked by cosine
    similarity to the query.
    """

    def __init__(self):
        # Tool metadata consumed by the surrounding agent framework
        # (presumably a smolagents-style tool registry — TODO confirm).
        self.name = "document_retriever"
        self.description = "Retrieves relevant text from GAIA text-heavy files (CSV, TXT, PDF) using semantic search."
        self.inputs = {
            "task_id": {"type": "string", "description": "GAIA task ID for the file"},
            "query": {"type": "string", "description": "Question or query to search for"},
            "file_type": {"type": "string", "description": "File type (csv, txt, pdf, default: txt)"}
        }
        self.output_type = str
        # Compact general-purpose sentence-embedding model.
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
        # 500-char chunks with 50-char overlap so text spanning a chunk
        # boundary is not lost to the search.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            length_function=len
        )
        # State from the most recent aparse() call.
        self.chunks: List[str] = []
        # NOTE: encode(..., convert_to_tensor=True) returns a torch.Tensor,
        # so the previous `np.ndarray` annotation (on a None initial value,
        # no less) was incorrect and has been dropped.
        self.embeddings = None

    async def aparse(self, task_id: str, query: str, file_type: str = "txt") -> str:
        """Retrieve the text most relevant to *query* from a GAIA task file.

        Args:
            task_id: GAIA task ID; the file is expected at
                ``temp_<task_id>.<file_type>`` in the working directory.
            query: Question or search phrase to match against the file.
            file_type: One of ``csv``, ``txt``, ``pdf`` (case-insensitive;
                surrounding whitespace ignored). Defaults to ``txt``.

        Returns:
            The top (up to 3) most similar chunks joined by blank lines, or a
            human-readable error string — this tool reports failures as
            strings rather than raising, so the agent can read them.
        """
        try:
            # Normalize so "TXT" / " pdf " behave the same as "txt" / "pdf".
            file_type = file_type.strip().lower()
            file_path = f"temp_{task_id}.{file_type}"
            if not os.path.exists(file_path):
                return f"File not found for task ID {task_id}"
            # Load and preprocess file into a single text string.
            text = ""
            if file_type == "csv":
                df = pd.read_csv(file_path)
                text = df.to_string()
            elif file_type == "txt":
                with open(file_path, "r", encoding="utf-8") as f:
                    text = f.read()
            elif file_type == "pdf":
                with open(file_path, "rb") as f:
                    pdf = PyPDF2.PdfReader(f)
                    # extract_text() may return None for image-only pages.
                    text = "".join(page.extract_text() or "" for page in pdf.pages)
            else:
                return f"Unsupported file type: {file_type}"
            # Bail out early if nothing usable was extracted.
            if not text.strip():
                return "No extractable text found in file."
            # Split text into overlapping chunks for embedding.
            self.chunks = self.text_splitter.split_text(text)
            if not self.chunks:
                return "No content found in file."
            # Embed chunks and query as torch tensors.
            self.embeddings = self.embedder.encode(self.chunks, convert_to_tensor=True)
            query_embedding = self.embedder.encode(query, convert_to_tensor=True)
            # Local import keeps module import time down; sentence_transformers
            # itself is already a file-level dependency.
            from sentence_transformers import util
            similarities = util.cos_sim(query_embedding, self.embeddings)[0]
            # Select the top-k (at most 3) chunks by descending similarity.
            top_k = min(3, len(self.chunks))
            top_indices = similarities.argsort(descending=True)[:top_k]
            relevant_chunks = [self.chunks[int(idx)] for idx in top_indices]
            # Combine results, blank-line separated.
            return "\n\n".join(relevant_chunks)
        except Exception as e:
            # Deliberate catch-all boundary: this tool's contract is to return
            # error text to the agent rather than raise.
            return f"Error retrieving documents: {str(e)}"
# Module-level singleton: instantiated at import time (downloads/loads the
# embedding model) and exposed for registration with the agent's tool set.
document_retriever_tool = DocumentRetrieverTool()