File size: 7,915 Bytes
cb3f557
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import os
import io
import requests
import logging
from typing import Optional
from agents import function_tool
from docx import Document
import PyPDF2
from .firebase_config import db

# Module-level logger named after this module (via __name__); the tool
# functions below use it to record each tool call and its outcome.
logger = logging.getLogger(__name__)


@function_tool
def read_document_data(query: str, source: str = "auto") -> str:
    """
    Read and search for information from documents stored locally or in Firebase Firestore.

    Args:
        query: The search query or topic to look for in the documents
        source: Data source - "local" for local files, "firestore" for Firebase, or "auto" to try
            local files first and fall back to Firestore when nothing was found locally.
            Matching is case-insensitive and ignores surrounding whitespace.

    Returns:
        The content gathered from the selected source(s), each under a "=== ... ===" header,
        or an explanatory message when nothing was found or `source` is not a supported value.
    """
    # NOTE(review): if `function_tool` derives the tool description from the
    # docstring above, keep it in Google style ("Args:" / "Returns:") when editing.
    logger.info("TOOL CALL: read_document_data called with query='%s', source='%s'", query, source)

    # Normalise the selector so values such as "Firestore" or " local " are
    # accepted instead of silently matching no branch at all.
    mode = str(source or "auto").strip().lower()
    if mode not in ("local", "firestore", "auto"):
        logger.warning("TOOL RESULT: read_document_data rejected unknown source='%s'", source)
        return f"Unknown source '{source}'. Valid options are 'local', 'firestore', or 'auto'."

    # NOTE(review): `query` is forwarded to the readers, but no filtering by
    # query happens in this function; it only assembles what the readers return.
    sections = []

    if mode in ("local", "auto"):
        local_content = _read_local_documents(query)
        if local_content:
            sections.append(f"=== Local Documents ===\n{local_content}")

    # Firestore is either the explicit choice, or the fallback when "auto"
    # produced nothing from local files.
    if mode == "firestore" or (mode == "auto" and not sections):
        firestore_content = _read_firestore_documents(query)
        if firestore_content:
            sections.append(f"=== Firestore Documents ===\n{firestore_content}")

    if not sections:
        logger.info("TOOL RESULT: read_document_data found no results for query='%s'", query)
        return f"No relevant information found for query: '{query}'. Please check if documents are available."

    logger.info("TOOL RESULT: read_document_data found %d result(s)", len(sections))
    return "\n\n".join(sections)

def _read_local_documents(query: str) -> Optional[str]:
    """Collect the text of local DOCX/PDF documents from the project root.

    The root directory is the parent of the directory that contains this
    module. ``data.docx`` is read first (when present), followed by every
    file whose name ends in ``.pdf``, in sorted name order so the output is
    deterministic.

    Args:
        query: Accepted for interface parity with the other readers; it is
            not used to filter the returned text.

    Returns:
        The extracted text of all documents, each prefixed with a
        ``[From <file>]`` header and separated by blank lines. A file that
        cannot be read contributes an ``Error reading <file>: ...`` entry
        instead of aborting the whole read. ``None`` when nothing produced
        any content.
    """
    # abspath guards against a relative __file__, for which the double dirname
    # could collapse to "" and make os.listdir fail.
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    content_parts = []

    # Single well-known DOCX file.
    docx_path = os.path.join(root_dir, "data.docx")
    if os.path.exists(docx_path):
        try:
            paragraphs = [
                paragraph.text
                for paragraph in Document(docx_path).paragraphs
                if paragraph.text.strip()
            ]
        except Exception as e:
            # Best effort: report the failure inline, but keep a log record too.
            logger.exception("Failed to read local document %s", docx_path)
            content_parts.append(f"Error reading data.docx: {str(e)}")
        else:
            if paragraphs:
                content_parts.append("[From data.docx]\n" + "\n".join(paragraphs))

    # Every PDF in the root directory; os.listdir order is arbitrary, so sort.
    for file_name in sorted(os.listdir(root_dir)):
        if not file_name.endswith(".pdf"):
            continue

        pdf_path = os.path.join(root_dir, file_name)
        try:
            with open(pdf_path, "rb") as pdf_file:
                # Defensive `or ""`: tolerate a None result so a single
                # text-less page cannot discard the whole file.
                page_texts = [
                    page.extract_text() or ""
                    for page in PyPDF2.PdfReader(pdf_file).pages
                ]
        except Exception as e:
            logger.exception("Failed to read local document %s", pdf_path)
            content_parts.append(f"Error reading {file_name}: {str(e)}")
        else:
            pdf_text = [text for text in page_texts if text.strip()]
            if pdf_text:
                content_parts.append(f"[From {file_name}]\n" + "\n".join(pdf_text))

    return "\n\n".join(content_parts) if content_parts else None


def _read_firestore_documents(query: str) -> Optional[str]:
    """Collect document text from the Firestore ``data`` collection.

    For each Firestore document, a truthy ``document`` field is treated as a
    URL and its file is downloaded and parsed; otherwise the first truthy
    value among the ``content``, ``text`` and ``data`` fields is used as the
    text. The ``name`` field (falling back to the document id) labels each
    entry.

    Args:
        query: Accepted for interface parity with the other readers; it is
            not used to filter the returned text.

    Returns:
        The collected text, one ``[From Firestore: <name>]`` section per
        document, separated by blank lines; ``None`` when no document
        produced content. Failures are reported as text rather than raised:
        a per-document ``[Error reading <id>]: ...`` entry, an
        ``Error reading from Firestore: ...`` message when the collection
        itself cannot be read, or a "not initialized" message when ``db`` is
        falsy.
    """
    if not db:
        return "Firebase Firestore is not initialized. Please check your serviceAccount.json file."

    try:
        content_parts = []
        for doc in db.collection("data").stream():
            # Defensive `or {}`: a snapshot without data must not abort the
            # whole collection read through the outer handler.
            doc_data = doc.to_dict() or {}
            doc_name = doc_data.get("name", doc.id)
            document_url = doc_data.get("document")

            if document_url:
                # The document points at a file: download and parse it.
                try:
                    content = _read_document_from_url(document_url, doc_name)
                except Exception as e:
                    # Best effort per document: report inline, keep going.
                    logger.exception("Failed to read Firestore document %s", doc.id)
                    content_parts.append(f"[Error reading {doc.id}]: {str(e)}")
                else:
                    if content:
                        content_parts.append(f"[From Firestore: {doc_name}]\n{content}")
                continue

            # Fallback: inline text stored under one of the known field names.
            doc_content = (
                doc_data.get("content")
                or doc_data.get("text")
                or doc_data.get("data")
            )
            if doc_content:
                content_parts.append(f"[From Firestore: {doc_name}]\n{doc_content}")

        return "\n\n".join(content_parts) if content_parts else None

    except Exception as e:
        # Tool boundary: surface the failure as text, but keep the traceback
        # in the log.
        logger.exception("Failed to read from Firestore")
        return f"Error reading from Firestore: {str(e)}"


def _guess_document_kind(url: str, content_type: str = "") -> Optional[str]:
    """Return ``"docx"`` or ``"pdf"`` for a document URL, or ``None`` if unknown.

    Signals are checked from most to least reliable: the extension of the URL
    path (query string ignored), the response Content-Type, and finally a
    plain substring match on the whole URL.
    """
    from urllib.parse import unquote, urlparse

    lowered_url = url.lower()

    # 1. Extension of the path component, so "...file.pdf?alt=media&token=..."
    #    is still recognised and text in the query string cannot mislead.
    path = unquote(urlparse(lowered_url).path)
    if path.endswith(".docx"):
        return "docx"
    if path.endswith(".pdf"):
        return "pdf"

    # 2. Media type reported by the server (parameters such as charset dropped).
    media_type = content_type.split(";", 1)[0].strip().lower()
    if media_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return "docx"
    if media_type == "application/pdf":
        return "pdf"

    # 3. Legacy heuristic, kept so URLs that were recognised before still are.
    if "docx" in lowered_url:
        return "docx"
    if "pdf" in lowered_url:
        return "pdf"
    return None


def _read_document_from_url(url: str, doc_name: str) -> Optional[str]:
    """Download a DOCX or PDF document from a URL and return its text.

    Args:
        url: Location of the document; fetched with a 30 second timeout.
        doc_name: Display name of the document. Not used by the current
            implementation; kept for interface compatibility.

    Returns:
        The non-blank paragraphs (DOCX) or non-blank page texts (PDF) joined
        with newlines, or an ``Unsupported file type for URL: ...`` message
        when the document type cannot be determined.

    Raises:
        Exception: When the download or parsing fails; the original error is
            attached as ``__cause__``.
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        kind = _guess_document_kind(url, response.headers.get("Content-Type") or "")
        payload = io.BytesIO(response.content)

        if kind == "docx":
            return "\n".join(
                paragraph.text
                for paragraph in Document(payload).paragraphs
                if paragraph.text.strip()
            )

        if kind == "pdf":
            # Defensive `or ""`: tolerate a None result so one text-less page
            # does not fail the whole document.
            page_texts = (
                page.extract_text() or ""
                for page in PyPDF2.PdfReader(payload).pages
            )
            return "\n".join(text for text in page_texts if text.strip())

        return f"Unsupported file type for URL: {url}"

    except Exception as e:
        # Same exception type and message as before; `from e` preserves the
        # original traceback for debugging.
        raise Exception(f"Failed to download/read document from {url}: {str(e)}") from e


@function_tool
def list_available_documents() -> str:
    """
    List all available documents from both local storage and Firestore.

    Returns:
        A formatted list of available documents from all sources
    """
    # NOTE(review): docstring wording is kept as-is in case `function_tool`
    # exposes it as the tool description; confirm before rewording.
    logger.info("TOOL CALL: list_available_documents called")

    project_root = os.path.dirname(os.path.dirname(__file__))
    sections = []

    # Local storage: the fixed data.docx first, then any PDFs next to it.
    local_entries = []
    if os.path.exists(os.path.join(project_root, "data.docx")):
        local_entries.append("- data.docx")
    local_entries.extend(
        f"- {entry}" for entry in os.listdir(project_root) if entry.endswith(".pdf")
    )
    if local_entries:
        sections.append("=== Local Documents ===\n" + "\n".join(local_entries))

    # Firestore: one line per document id in the 'data' collection. Skipped
    # entirely when the client is unavailable (db is falsy).
    if db:
        try:
            firestore_entries = []
            for doc in db.collection("data").stream():
                firestore_entries.append(f"- {doc.id}")
        except Exception as e:
            sections.append(f"Error listing Firestore documents: {str(e)}")
        else:
            if firestore_entries:
                sections.append("=== Firestore Documents ===\n" + "\n".join(firestore_entries))

    if sections:
        response = "\n\n".join(sections)
    else:
        response = "No documents found in any source."

    logger.info(f"TOOL RESULT: list_available_documents found {len(sections)} source(s) with documents")
    return response