Spaces:
Runtime error
Runtime error
File size: 7,915 Bytes
eb60b56 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 | import os
import io
import requests
import logging
from typing import Optional
from agents import function_tool
from docx import Document
import PyPDF2
from .firebase_config import db
# Module-level logger named after this module; the tool functions below use it
# to record their "TOOL CALL" / "TOOL RESULT" messages.
logger = logging.getLogger(__name__)
@function_tool
def read_document_data(query: str, source: str = "auto") -> str:
    """
    Read document content for a query from local files and/or Firebase Firestore.

    Args:
        query: The search query or topic to look for; it is forwarded to the document readers
        source: Data source - "local" for local files, "firestore" for Firebase, or "auto" to
            try local files first and fall back to Firestore (case-insensitive)

    Returns:
        The content returned by the selected source(s), each under its own heading, or a
        message explaining that nothing was found or that the source value is not supported
    """
    logger.info(
        "TOOL CALL: read_document_data called with query='%s', source='%s'", query, source
    )

    # Normalise so that values such as "Local" or " AUTO " still select a source
    # instead of silently skipping both readers.
    mode = (source or "auto").strip().lower()
    if mode not in ("local", "firestore", "auto"):
        logger.warning(
            "TOOL RESULT: read_document_data rejected unsupported source='%s'", source
        )
        return (
            f"Unsupported source '{source}'. "
            "Valid values are 'local', 'firestore' or 'auto'."
        )

    result = []

    if mode in ("local", "auto"):
        local_content = _read_local_documents(query)
        if local_content:
            result.append(f"=== Local Documents ===\n{local_content}")

    # Firestore is the only source in "firestore" mode and the fallback in
    # "auto" mode when the local reader returned nothing.
    if mode == "firestore" or (mode == "auto" and not result):
        firestore_content = _read_firestore_documents(query)
        if firestore_content:
            result.append(f"=== Firestore Documents ===\n{firestore_content}")

    if not result:
        logger.info(
            "TOOL RESULT: read_document_data found no results for query='%s'", query
        )
        return (
            f"No relevant information found for query: '{query}'. "
            "Please check if documents are available."
        )

    logger.info("TOOL RESULT: read_document_data found %d result(s)", len(result))
    return "\n\n".join(result)
def _read_local_documents(query: str) -> Optional[str]:
    """Read the local ``data.docx`` and every ``*.pdf`` file in the project root.

    The project root is the parent of the directory that contains this module.
    Failures are not raised; they are reported as ``Error ...`` lines inside the
    returned text so the caller always receives a string or ``None``.

    Args:
        query: The search query. It is currently not used for filtering; all
            readable content is returned.

    Returns:
        The extracted text of all readable documents, each prefixed with a
        ``[From <file>]`` header and separated by blank lines, or ``None`` when
        there is nothing to return.
    """
    # abspath guards against a relative ``__file__``: without it the double
    # dirname can collapse to "" and os.listdir("") raises FileNotFoundError.
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    content_parts = []

    # Try to read the DOCX file
    docx_path = os.path.join(root_dir, "data.docx")
    if os.path.exists(docx_path):
        try:
            doc = Document(docx_path)
            docx_content = "\n".join(
                paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()
            )
            if docx_content:
                content_parts.append(f"[From data.docx]\n{docx_content}")
        except Exception as e:
            content_parts.append(f"Error reading data.docx: {e}")

    # Collect the PDF files. os.listdir() returns entries in arbitrary order,
    # so sort them to keep the combined output stable between runs.
    try:
        pdf_names = sorted(
            name for name in os.listdir(root_dir) if name.endswith(".pdf")
        )
    except OSError as e:
        # Report the problem the same way as the per-file errors instead of
        # letting it escape from the tool call.
        content_parts.append(f"Error listing {root_dir}: {e}")
        pdf_names = []

    for name in pdf_names:
        pdf_path = os.path.join(root_dir, name)
        try:
            with open(pdf_path, "rb") as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                pdf_text = []
                for page in pdf_reader.pages:
                    text = page.extract_text()
                    # Skip pages that yield no text (empty or None)
                    if text and text.strip():
                        pdf_text.append(text)
            if pdf_text:
                content_parts.append(f"[From {name}]\n" + "\n".join(pdf_text))
        except Exception as e:
            content_parts.append(f"Error reading {name}: {e}")

    return "\n\n".join(content_parts) if content_parts else None
def _read_firestore_documents(query: str) -> Optional[str]:
    """Collect text from every document in the Firestore ``data`` collection.

    For each document, a truthy ``document`` field is treated as a URL and its
    file is fetched through ``_read_document_from_url``. Otherwise the first
    truthy value among the ``content``, ``text`` and ``data`` fields is used.
    The ``name`` field (falling back to the document id) labels each section.

    Args:
        query: The search query. It is not used for filtering in this function.

    Returns:
        The labelled sections joined by blank lines, ``None`` when no section
        was produced, or an explanatory message when Firestore is not
        initialised or reading the collection fails.
    """
    if not db:
        return "Firebase Firestore is not initialized. Please check your serviceAccount.json file."

    try:
        sections = []
        for snapshot in db.collection("data").stream():
            fields = snapshot.to_dict()
            file_url = fields.get("document")

            if not file_url:
                # No linked file: fall back to text stored directly on the document
                inline_text = (
                    fields.get("content")
                    or fields.get("text")
                    or fields.get("data")
                )
                if inline_text:
                    label = fields.get("name", snapshot.id)
                    sections.append(f"[From Firestore: {label}]\n{inline_text}")
                continue

            # Linked file: download it and extract its text; a failure is
            # recorded for this document only and the loop carries on.
            try:
                label = fields.get("name", snapshot.id)
                downloaded = _read_document_from_url(file_url, label)
                if downloaded:
                    sections.append(f"[From Firestore: {label}]\n{downloaded}")
            except Exception as exc:
                sections.append(f"[Error reading {snapshot.id}]: {exc}")

        if not sections:
            return None
        return "\n\n".join(sections)
    except Exception as exc:
        return f"Error reading from Firestore: {exc}"
def _read_document_from_url(url: str, doc_name: str) -> Optional[str]:
    """Download a DOCX or PDF document from a URL and return its text.

    The file type is decided in this order:

    1. the extension of the URL *path*, so a query string such as
       ``?alt=media&token=...`` or an unrelated host name does not interfere;
    2. the ``Content-Type`` header of the response;
    3. as a last resort, whether ``docx`` / ``pdf`` occurs anywhere in the URL.

    Args:
        url: Location of the document to download.
        doc_name: Display name of the document; not used by this function.

    Returns:
        The non-empty paragraphs (DOCX) or page texts (PDF) joined by newlines,
        or an ``Unsupported file type`` message when the type cannot be
        determined.

    Raises:
        Exception: If the download or the parsing fails; the original error is
            attached as the cause.
    """
    from urllib.parse import unquote, urlparse

    try:
        # Download the file from URL
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        lowered_url = url.lower()
        # Percent-decoding handles paths such as ".../o/folder%2Freport.pdf"
        path = unquote(urlparse(url).path).lower()
        content_type = (response.headers.get("Content-Type") or "").lower()

        if path.endswith(".docx"):
            kind = "docx"
        elif path.endswith(".pdf"):
            kind = "pdf"
        elif "wordprocessingml" in content_type:
            kind = "docx"
        elif "application/pdf" in content_type:
            kind = "pdf"
        elif "docx" in lowered_url:
            kind = "docx"
        elif "pdf" in lowered_url:
            kind = "pdf"
        else:
            return f"Unsupported file type for URL: {url}"

        payload = io.BytesIO(response.content)

        if kind == "docx":
            doc = Document(payload)
            return "\n".join(
                paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()
            )

        pdf_reader = PyPDF2.PdfReader(payload)
        pdf_text = []
        for page in pdf_reader.pages:
            text = page.extract_text()
            # Skip pages that yield no text (empty or None)
            if text and text.strip():
                pdf_text.append(text)
        return "\n".join(pdf_text)
    except Exception as e:
        # Same exception type and message as before; "from e" keeps the cause explicit
        raise Exception(f"Failed to download/read document from {url}: {str(e)}") from e
@function_tool
def list_available_documents() -> str:
    """
    List the documents that can be found in local storage and in Firestore.

    Returns:
        A formatted listing of document names grouped by source, or a message
        saying that no documents were found
    """
    logger.info("TOOL CALL: list_available_documents called")

    project_root = os.path.dirname(os.path.dirname(__file__))
    sections = []

    # Local documents: the fixed data.docx plus every PDF in the project root
    local_entries = []
    if os.path.exists(os.path.join(project_root, "data.docx")):
        local_entries.append("- data.docx")
    local_entries.extend(
        f"- {name}" for name in os.listdir(project_root) if name.endswith(".pdf")
    )
    if local_entries:
        sections.append("=== Local Documents ===\n" + "\n".join(local_entries))

    # Firestore documents: ids of everything in the "data" collection
    if db:
        try:
            firestore_entries = [
                f"- {snapshot.id}" for snapshot in db.collection("data").stream()
            ]
            if firestore_entries:
                sections.append(
                    "=== Firestore Documents ===\n" + "\n".join(firestore_entries)
                )
        except Exception as exc:
            sections.append(f"Error listing Firestore documents: {exc}")

    if sections:
        response = "\n\n".join(sections)
    else:
        response = "No documents found in any source."

    logger.info(f"TOOL RESULT: list_available_documents found {len(sections)} source(s) with documents")
    return response
|