import io
import logging
import os
from typing import IO, List, Optional, Union

import PyPDF2
import requests
from agents import function_tool
from docx import Document

from .firebase_config import db

# Set up logging
logger = logging.getLogger(__name__)

# File name of the single Word document looked up in the project root.
_LOCAL_DOCX_NAME = "data.docx"
# Firestore collection that holds the document records.
_FIRESTORE_COLLECTION = "data"
# Seconds to wait for a document download before giving up.
_DOWNLOAD_TIMEOUT = 30


def _project_root() -> str:
    """Return the parent of the directory that contains this module."""
    return os.path.dirname(os.path.dirname(__file__))


def _list_local_pdfs(root_dir: str) -> List[str]:
    """Return the names of the PDF entries directly inside *root_dir*.

    The suffix match is case-insensitive (consistent with the URL reader) and
    the names are sorted so the output order does not depend on the
    file system's listing order.
    """
    return sorted(
        name for name in os.listdir(root_dir) if name.lower().endswith(".pdf")
    )


def _extract_docx_text(source: Union[str, IO[bytes]]) -> str:
    """Return the non-blank paragraphs of a DOCX document joined by newlines.

    Args:
        source: A file path or a binary file-like object holding the document.
    """
    document = Document(source)
    return "\n".join(
        paragraph.text
        for paragraph in document.paragraphs
        if paragraph.text.strip()
    )


def _extract_pdf_text(stream: IO[bytes]) -> str:
    """Return the text of every non-blank PDF page joined by newlines.

    Args:
        stream: A binary file-like object positioned at the start of the PDF.
    """
    reader = PyPDF2.PdfReader(stream)
    pages = []
    for page in reader.pages:
        # Defensive: treat a falsy extraction result as "no text" so a single
        # odd page cannot abort the whole document with an AttributeError.
        text = page.extract_text() or ""
        if text.strip():
            pages.append(text)
    return "\n".join(pages)


@function_tool
def read_document_data(query: str, source: str = "auto") -> str:
    """
    Read and search for information from documents stored locally or in Firebase Firestore.

    Args:
        query: The search query or topic to look for in the documents
        source: Data source - "local" for local files, "firestore" for Firebase, or "auto" to try both

    Returns:
        The relevant content from the document(s) matching the query
    """
    # NOTE(review): `function_tool` presumably publishes the docstring above as
    # the tool description, so its wording is left untouched - confirm before
    # editing it.
    # NOTE: `query` is forwarded to the readers below, but they currently
    # return the full text of every document; no filtering on `query` happens.
    logger.info(
        "TOOL CALL: read_document_data called with query='%s', source='%s'",
        query,
        source,
    )

    result = []

    # Local files are read for "local" and "auto".
    if source in ("local", "auto"):
        local_content = _read_local_documents(query)
        if local_content:
            result.append(f"=== Local Documents ===\n{local_content}")

    # Firestore is read when requested explicitly, or in "auto" mode only as a
    # fallback when the local read produced nothing.
    if source == "firestore" or (source == "auto" and not result):
        firestore_content = _read_firestore_documents(query)
        if firestore_content:
            result.append(f"=== Firestore Documents ===\n{firestore_content}")

    if result:
        response = "\n\n".join(result)
        logger.info(
            "TOOL RESULT: read_document_data found %d result(s)", len(result)
        )
        return response

    # Any other `source` value also ends up here because neither branch runs.
    response = f"No relevant information found for query: '{query}'. Please check if documents are available."
    logger.info(
        "TOOL RESULT: read_document_data found no results for query='%s'", query
    )
    return response


def _read_local_documents(query: str) -> Optional[str]:
    """Read ``data.docx`` and every PDF found in the project root.

    Args:
        query: Accepted for symmetry with the Firestore reader; currently
            unused, the full text of each file is returned.

    Returns:
        One labelled section per readable file (or an ``Error reading ...``
        line for a file that failed), separated by blank lines, or ``None``
        when nothing was collected.
    """
    root_dir = _project_root()
    content_parts = []

    docx_path = os.path.join(root_dir, _LOCAL_DOCX_NAME)
    if os.path.exists(docx_path):
        try:
            docx_content = _extract_docx_text(docx_path)
        except Exception as e:
            # Best effort: report the failure in the output and keep going.
            logger.exception("Failed to read local document %s", docx_path)
            content_parts.append(f"Error reading {_LOCAL_DOCX_NAME}: {e}")
        else:
            if docx_content:
                content_parts.append(f"[From {_LOCAL_DOCX_NAME}]\n{docx_content}")

    for file_name in _list_local_pdfs(root_dir):
        pdf_path = os.path.join(root_dir, file_name)
        try:
            with open(pdf_path, "rb") as pdf_file:
                pdf_content = _extract_pdf_text(pdf_file)
        except Exception as e:
            # Best effort: one unreadable PDF must not hide the others.
            logger.exception("Failed to read local document %s", pdf_path)
            content_parts.append(f"Error reading {file_name}: {e}")
        else:
            if pdf_content:
                content_parts.append(f"[From {file_name}]\n{pdf_content}")

    return "\n\n".join(content_parts) if content_parts else None


def _read_firestore_documents(query: str) -> Optional[str]:
    """Read every record of the Firestore ``data`` collection.

    A record whose ``document`` field is set is treated as a URL and
    downloaded; otherwise the first truthy value among the ``content``,
    ``text`` and ``data`` fields is used. Sections are labelled with the
    record's ``name`` field, falling back to the record id.

    Args:
        query: Currently unused; every record is returned.

    Returns:
        The labelled sections separated by blank lines, ``None`` when nothing
        was collected, or a human-readable error string when Firestore is not
        initialized or the collection cannot be read.
    """
    if not db:
        return "Firebase Firestore is not initialized. Please check your serviceAccount.json file."

    try:
        content_parts = []
        for doc in db.collection(_FIRESTORE_COLLECTION).stream():
            doc_data = doc.to_dict()
            doc_name = doc_data.get("name", doc.id)

            document_url = doc_data.get("document")
            if document_url:
                try:
                    content = _read_document_from_url(document_url, doc_name)
                except Exception as e:
                    # Best effort: report this record and continue with the rest.
                    logger.exception(
                        "Failed to read Firestore document %s", doc.id
                    )
                    content_parts.append(f"[Error reading {doc.id}]: {e}")
                    continue
            else:
                # Fallback: inline content under one of several field names.
                content = (
                    doc_data.get("content")
                    or doc_data.get("text")
                    or doc_data.get("data")
                )

            if content:
                content_parts.append(f"[From Firestore: {doc_name}]\n{content}")

        return "\n\n".join(content_parts) if content_parts else None
    except Exception as e:
        logger.exception("Failed to read from Firestore")
        return f"Error reading from Firestore: {e}"


def _read_document_from_url(url: str, doc_name: str) -> Optional[str]:
    """Download a DOCX or PDF from *url* and return its text.

    The file type is inferred from the URL text: any URL containing ``docx``
    is parsed as DOCX, otherwise any URL containing ``pdf`` is parsed as PDF
    (both case-insensitive).

    Args:
        url: Location of the document to download.
        doc_name: Display name of the document; currently unused.

    Returns:
        The extracted text (possibly empty), or an ``Unsupported file type``
        message when the URL matches neither type.

    Raises:
        Exception: If the download or the parsing fails; the original error
            is attached as ``__cause__``.
    """
    try:
        response = requests.get(url, timeout=_DOWNLOAD_TIMEOUT)
        response.raise_for_status()

        # A substring test is used (not just a suffix test) so that URLs with
        # trailing query strings are still recognised; "docx" is checked first.
        lowered_url = url.lower()
        if "docx" in lowered_url:
            return _extract_docx_text(io.BytesIO(response.content))
        if "pdf" in lowered_url:
            return _extract_pdf_text(io.BytesIO(response.content))
        return f"Unsupported file type for URL: {url}"
    except Exception as e:
        raise Exception(
            f"Failed to download/read document from {url}: {e}"
        ) from e


@function_tool
def list_available_documents() -> str:
    """
    List all available documents from both local storage and Firestore.

    Returns:
        A formatted list of available documents from all sources
    """
    # NOTE(review): `function_tool` presumably publishes the docstring above as
    # the tool description, so its wording is left untouched - confirm before
    # editing it.
    logger.info("TOOL CALL: list_available_documents called")

    result = []

    # Local documents: the fixed Word file first, then the PDFs.
    root_dir = _project_root()
    local_docs = []
    if os.path.exists(os.path.join(root_dir, _LOCAL_DOCX_NAME)):
        local_docs.append(f"- {_LOCAL_DOCX_NAME}")
    local_docs.extend(f"- {name}" for name in _list_local_pdfs(root_dir))

    if local_docs:
        result.append("=== Local Documents ===\n" + "\n".join(local_docs))

    # Firestore documents are listed by record id.
    if db:
        try:
            firestore_docs = [
                f"- {doc.id}"
                for doc in db.collection(_FIRESTORE_COLLECTION).stream()
            ]
            if firestore_docs:
                result.append(
                    "=== Firestore Documents ===\n" + "\n".join(firestore_docs)
                )
        except Exception as e:
            logger.exception("Failed to list Firestore documents")
            result.append(f"Error listing Firestore documents: {e}")

    response = "\n\n".join(result) if result else "No documents found in any source."
    logger.info(
        "TOOL RESULT: list_available_documents found %d source(s) with documents",
        len(result),
    )
    return response