# File: DigitalIslamicChatbotAPI/tools/document_reader_tool.py
# Repository-viewer page metadata: Tahasaif3 · 'code' · 39fc07a
import os
import io
import requests
import logging
from typing import Optional
from agents import function_tool
from docx import Document
import PyPDF2
# Module-level logger, named after this module so the application's logging
# configuration can filter or route the tool-call messages emitted below.
logger = logging.getLogger(__name__)
@function_tool
def read_document_data(query: str) -> str:
    """
    Read and search for information from documents stored locally.

    Args:
        query: The search query or topic to look for in the documents

    Returns:
        The relevant content from the document(s) matching the query
    """
    # NOTE(review): `function_tool` presumably exposes the docstring above to
    # the model as the tool description, so its wording is kept as-is; confirm
    # before rewording it.
    logger.info(f"TOOL CALL: read_document_data called with query='{query}'")

    # The query is forwarded to the local reader; whatever text it returns is
    # handed back verbatim.
    found_text = _read_local_documents(query)

    if not found_text:
        # Nothing came back (None or empty): log it and answer with a
        # human-readable fallback message instead.
        logger.info(f"TOOL RESULT: read_document_data found no results for query='{query}'")
        return f"No relevant information found for query: '{query}'. Please check if documents are available."

    logger.info("TOOL RESULT: read_document_data found results")
    return found_text
def _read_local_documents(query: str) -> Optional[str]:
    """Collect the text of the local ``data.docx`` and of every ``*.pdf`` file.

    Files are looked up in the directory one level above the directory that
    contains this module. All extracted text is returned; ``query`` is not
    used to filter or rank the content.

    Args:
        query: The caller's search query. Currently unused; kept so the
            signature stays compatible with existing callers.

    Returns:
        The text of each document, each section prefixed with a
        ``[From <file>]`` header and separated by blank lines, or ``None``
        when no section was produced. A document that cannot be read
        contributes an ``Error reading <file>: ...`` section instead of
        raising.
    """
    # abspath() matters: with a relative __file__ the two dirname() calls can
    # collapse to "", and os.listdir("") raises FileNotFoundError.
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    content_parts = []

    # DOCX: a single, fixed file name.
    docx_path = os.path.join(root_dir, "data.docx")
    if os.path.exists(docx_path):
        try:
            doc = Document(docx_path)
            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
            if paragraphs:
                content_parts.append("[From data.docx]\n" + "\n".join(paragraphs))
        except Exception as e:
            # Best effort: report the failure in the returned text (as before)
            # and additionally record it in the log with a traceback.
            logger.exception("Failed to read %s", docx_path)
            content_parts.append(f"Error reading data.docx: {str(e)}")

    # PDF: every file in root_dir whose name ends in ".pdf". The listing is
    # sorted because os.listdir() returns entries in arbitrary order, which
    # would make the section order vary between runs.
    for filename in sorted(os.listdir(root_dir)):
        if not filename.endswith(".pdf"):
            continue
        pdf_path = os.path.join(root_dir, filename)
        try:
            with open(pdf_path, "rb") as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                extracted = (page.extract_text() for page in pdf_reader.pages)
                # Treat any falsy result (empty string or None) as "no text on
                # this page" instead of letting .strip() fail on a non-string.
                page_texts = [text for text in extracted if text and text.strip()]
            if page_texts:
                content_parts.append(f"[From {filename}]\n" + "\n".join(page_texts))
        except Exception as e:
            logger.exception("Failed to read %s", pdf_path)
            content_parts.append(f"Error reading {filename}: {str(e)}")

    return "\n\n".join(content_parts) if content_parts else None
def _read_document_from_url(url: str, doc_name: str) -> Optional[str]:
    """Download a DOCX or PDF document from ``url`` and return its text.

    The document type is taken from the extension of the URL's path
    (``.docx`` / ``.pdf``). When the path has neither extension, the function
    falls back to looking for ``docx`` and then ``pdf`` anywhere in the URL,
    which keeps URLs that carry the format in a query parameter working.

    Args:
        url: Location of the document to download.
        doc_name: Name of the document. Currently unused; kept so the
            signature stays compatible with existing callers.

    Returns:
        The non-blank paragraphs (DOCX) or non-blank page texts (PDF) joined
        with newlines, or the message ``Unsupported file type for URL: <url>``
        when the type cannot be determined.

    Raises:
        Exception: If downloading or parsing fails; the original error is
            attached as ``__cause__``.
    """
    from urllib.parse import urlparse

    try:
        # Download first (30 s timeout); HTTP error statuses become exceptions.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        lowered_url = url.lower()
        url_path = urlparse(lowered_url).path
        # The path extension wins over the substring heuristic; otherwise a
        # URL such as "https://docx.example.com/a.pdf" is parsed as DOCX.
        if url_path.endswith(".docx"):
            doc_type = "docx"
        elif url_path.endswith(".pdf"):
            doc_type = "pdf"
        elif "docx" in lowered_url:
            doc_type = "docx"
        elif "pdf" in lowered_url:
            doc_type = "pdf"
        else:
            return f"Unsupported file type for URL: {url}"

        payload = io.BytesIO(response.content)

        if doc_type == "docx":
            doc = Document(payload)
            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())

        pdf_reader = PyPDF2.PdfReader(payload)
        extracted = (page.extract_text() for page in pdf_reader.pages)
        # Treat any falsy result (empty string or None) as "no text on this
        # page" instead of letting .strip() fail on a non-string.
        return "\n".join(text for text in extracted if text and text.strip())
    except Exception as e:
        # Same exception type and message as before, now explicitly chained so
        # the root cause stays attached to the raised error.
        raise Exception(f"Failed to download/read document from {url}: {str(e)}") from e
@function_tool
def list_available_documents() -> str:
    """
    List all available documents from local storage.

    Returns:
        A formatted list of available documents from local sources
    """
    # NOTE(review): `function_tool` presumably exposes the docstring above to
    # the model as the tool description, so its wording is kept as-is; confirm
    # before rewording it.
    logger.info("TOOL CALL: list_available_documents called")

    # Documents live one directory above the one containing this module.
    # abspath() matters: with a relative __file__ the two dirname() calls can
    # collapse to "", and os.listdir("") raises FileNotFoundError.
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    local_docs = []
    if os.path.exists(os.path.join(root_dir, "data.docx")):
        local_docs.append("- data.docx")

    # Sorted because os.listdir() returns entries in arbitrary order, which
    # would make the listing vary between runs.
    local_docs.extend(
        f"- {filename}"
        for filename in sorted(os.listdir(root_dir))
        if filename.endswith(".pdf")
    )

    if local_docs:
        response = "=== Local Documents ===\n" + "\n".join(local_docs)
    else:
        response = "No documents found in local storage."

    logger.info(f"TOOL RESULT: list_available_documents found {len(local_docs)} document(s)")
    return response