Spaces:

MuhammadSaad16
/

jobobike_chatbot

Runtime error

App Files Files Community

jobobike_chatbot / tools /document_reader_tool.py

MuhammadSaad16

Add application file

eb60b56 3 months ago

raw

history blame contribute delete

7.92 kB

	import os
	import io
	import requests
	import logging
	from typing import Optional
	from agents import function_tool
	from docx import Document
	import PyPDF2
	from .firebase_config import db

	# Module-level logger named after this module; the tool functions below use it
	# to record each call and its outcome. No handlers or levels are set here.
	logger = logging.getLogger(__name__)


	@function_tool
	def read_document_data(query: str, source: str = "auto") -> str:
	    """
	    Read document content from local files or Firebase Firestore.

	    The extracted text of the available documents is returned as-is; it is
	    not filtered by ``query``, so the relevant passage has to be located in
	    the returned text.

	    Args:
	        query: The search query or topic to look for in the documents
	        source: Data source - "local" for local files, "firestore" for
	            Firebase, or "auto" to try local files first and use Firestore
	            only when nothing was found locally. Case-insensitive;
	            unrecognized values are treated as "auto".

	    Returns:
	        The content of the document(s), grouped under one heading per
	        source, or a message stating that nothing was found
	    """
	    # NOTE(review): function_tool presumably exposes the docstring above to the
	    # model as the tool description, so keep it accurate and in Google style.
	    logger.info(f"TOOL CALL: read_document_data called with query='{query}', source='{source}'")

	    # Tool arguments arrive verbatim from the caller, so tolerate case and
	    # whitespace differences and fall back to "auto" rather than matching no
	    # branch at all and reporting "nothing found".
	    requested = str(source or "auto").strip().lower()
	    if requested not in ("local", "firestore", "auto"):
	        logger.warning(f"read_document_data got unknown source='{source}'; falling back to 'auto'")
	        requested = "auto"

	    sections = []

	    if requested in ("local", "auto"):
	        local_content = _read_local_documents(query)
	        if local_content:
	            sections.append(f"=== Local Documents ===\n{local_content}")

	    # Firestore is consulted when requested explicitly, or in "auto" mode only
	    # as a fallback when the local files produced nothing.
	    if requested == "firestore" or (requested == "auto" and not sections):
	        firestore_content = _read_firestore_documents(query)
	        if firestore_content:
	            sections.append(f"=== Firestore Documents ===\n{firestore_content}")

	    if not sections:
	        logger.info(f"TOOL RESULT: read_document_data found no results for query='{query}'")
	        return f"No relevant information found for query: '{query}'. Please check if documents are available."

	    logger.info(f"TOOL RESULT: read_document_data found {len(sections)} result(s)")
	    return "\n\n".join(sections)

	def _read_local_documents(query: str) -> Optional[str]:
	    """Read the text of local DOCX and PDF files in the project root directory.

	    The root directory is the parent of the directory containing this module.
	    ``data.docx`` is read first (non-empty paragraphs only), followed by every
	    PDF file found directly in that directory, in sorted name order.

	    Args:
	        query: Currently unused; the text is returned in full and is not
	            filtered by it.

	    Returns:
	        The extracted text, one labelled section per file separated by blank
	        lines, or None when no file produced any content. A file that fails
	        to parse contributes an "Error reading ..." line instead of raising.
	    """
	    # abspath guards against a relative __file__, where the double dirname
	    # could collapse to "" and make os.listdir() fail.
	    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	    content_parts = []

	    # Try to read DOCX file
	    docx_path = os.path.join(root_dir, "data.docx")
	    if os.path.exists(docx_path):
	        try:
	            doc = Document(docx_path)
	            docx_content = "\n".join(
	                paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()
	            )
	            if docx_content:
	                content_parts.append(f"[From data.docx]\n{docx_content}")
	        except Exception as e:
	            # Best effort: report the failure in the output and keep going.
	            logger.exception("Failed to read local document data.docx")
	            content_parts.append(f"Error reading data.docx: {str(e)}")

	    # Try to read PDF files. Sorted so the output order does not depend on the
	    # directory listing order; the extension match is case-insensitive so
	    # files named e.g. "MANUAL.PDF" are not skipped.
	    for file in sorted(os.listdir(root_dir)):
	        if not file.lower().endswith(".pdf"):
	            continue

	        pdf_path = os.path.join(root_dir, file)
	        try:
	            with open(pdf_path, "rb") as pdf_file:
	                pdf_reader = PyPDF2.PdfReader(pdf_file)
	                pdf_text = []
	                for page in pdf_reader.pages:
	                    # `or ""` keeps one page without extractable text from
	                    # discarding the whole file via the except below.
	                    text = page.extract_text() or ""
	                    if text.strip():
	                        pdf_text.append(text)

	            if pdf_text:
	                content_parts.append(f"[From {file}]\n" + "\n".join(pdf_text))
	        except Exception as e:
	            logger.exception(f"Failed to read local document {file}")
	            content_parts.append(f"Error reading {file}: {str(e)}")

	    return "\n\n".join(content_parts) if content_parts else None


	def _read_firestore_documents(query: str) -> Optional[str]:
	    """Read documents from the Firebase Firestore 'data' collection.

	    For each document, a non-empty "document" field is treated as a URL to a
	    file that is downloaded and parsed; otherwise the first non-empty of the
	    "content", "text" and "data" fields is used as inline text.

	    Args:
	        query: Currently unused; every document's content is returned and is
	            not filtered by it.

	    Returns:
	        The collected content, one labelled section per document separated
	        by blank lines, or None when nothing was found. When Firestore is not
	        initialized or the collection cannot be read, a message describing
	        the problem is returned instead of raising.
	    """
	    if not db:
	        return "Firebase Firestore is not initialized. Please check your serviceAccount.json file."

	    try:
	        content_parts = []
	        for doc in db.collection("data").stream():
	            # Guard against a snapshot without data so one odd document does
	            # not abort the whole collection via the outer except.
	            doc_data = doc.to_dict() or {}
	            # Use the document id when "name" is missing *or* empty, so the
	            # label never reads "None" or blank.
	            doc_name = doc_data.get("name") or doc.id

	            # Check if document field contains a URL to a file
	            document_url = doc_data.get("document")
	            if document_url:
	                try:
	                    content = _read_document_from_url(document_url, doc_name)
	                    if content:
	                        content_parts.append(f"[From Firestore: {doc_name}]\n{content}")
	                except Exception as e:
	                    # Best effort: note the failure and continue with the rest.
	                    logger.exception(f"Failed to read Firestore document {doc.id}")
	                    content_parts.append(f"[Error reading {doc.id}]: {str(e)}")
	                continue

	            # Fallback: Try to extract content from different possible field names
	            doc_content = (
	                doc_data.get("content")
	                or doc_data.get("text")
	                or doc_data.get("data")
	            )
	            if doc_content:
	                content_parts.append(f"[From Firestore: {doc_name}]\n{doc_content}")

	        return "\n\n".join(content_parts) if content_parts else None

	    except Exception as e:
	        logger.exception("Failed to read the Firestore 'data' collection")
	        return f"Error reading from Firestore: {str(e)}"


	def _read_document_from_url(url: str, doc_name: str) -> Optional[str]:
	    """Download a document (DOCX or PDF) from a URL and return its text.

	    Args:
	        url: Location of the file; fetched with a 30 second timeout.
	        doc_name: Display name of the document. Currently unused.

	    Returns:
	        The extracted text (non-empty DOCX paragraphs or PDF pages joined by
	        newlines, possibly an empty string), or an "Unsupported file type"
	        message when the file is recognized as neither DOCX nor PDF.

	    Raises:
	        Exception: If the download or the parsing fails; the original error
	            is attached as the cause.
	    """
	    try:
	        # Download the file from URL
	        response = requests.get(url, timeout=30)
	        response.raise_for_status()
	        payload = response.content

	        kind = _guess_document_kind(
	            url, response.headers.get("Content-Type") or "", payload[:8]
	        )

	        if kind == "docx":
	            # Read DOCX from bytes
	            doc = Document(io.BytesIO(payload))
	            return "\n".join(
	                paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()
	            )

	        if kind == "pdf":
	            # Read PDF from bytes
	            pdf_reader = PyPDF2.PdfReader(io.BytesIO(payload))
	            pdf_text = []
	            for page in pdf_reader.pages:
	                # `or ""` tolerates a page with no extractable text.
	                text = page.extract_text() or ""
	                if text.strip():
	                    pdf_text.append(text)
	            return "\n".join(pdf_text)

	        return f"Unsupported file type for URL: {url}"

	    except Exception as e:
	        # Same exception type and message as before; `from e` keeps the
	        # original traceback attached for debugging.
	        raise Exception(f"Failed to download/read document from {url}: {str(e)}") from e


	def _guess_document_kind(url: str, content_type: str = "", head: bytes = b"") -> Optional[str]:
	    """Return "docx" or "pdf" for a downloaded file, or None if unrecognized.

	    Checks, in order: the extension of the decoded URL path, the PDF file
	    signature, the Content-Type header, and finally a substring search over
	    the whole URL (the original heuristic, kept as a last resort).
	    """
	    from urllib.parse import unquote, urlparse

	    lowered = url.lower()

	    # Look at the path component only, percent-decoded, so a query string or
	    # an encoded "/" does not hide the extension, and so "docx" or "pdf"
	    # appearing elsewhere in the URL cannot override the real extension.
	    extension = os.path.splitext(unquote(urlparse(lowered).path))[1]
	    if extension == ".docx":
	        return "docx"
	    if extension == ".pdf":
	        return "pdf"

	    # PDF files begin with the "%PDF-" signature.
	    if head.startswith(b"%PDF-"):
	        return "pdf"

	    media_type = content_type.lower()
	    if "wordprocessingml" in media_type:
	        return "docx"
	    if "application/pdf" in media_type:
	        return "pdf"

	    # Last resort: the original substring heuristic, "docx" checked first.
	    if "docx" in lowered:
	        return "docx"
	    if "pdf" in lowered:
	        return "pdf"
	    return None


	@function_tool
	def list_available_documents() -> str:
	    """
	    List all available documents from both local storage and Firestore.

	    Returns:
	        A formatted list of available documents from all sources
	    """
	    logger.info("TOOL CALL: list_available_documents called")

	    result = []

	    # List local documents. abspath guards against a relative __file__, where
	    # the double dirname could collapse to "" and make os.listdir() fail.
	    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	    local_docs = []

	    if os.path.exists(os.path.join(root_dir, "data.docx")):
	        local_docs.append("- data.docx")

	    # Sorted for a stable listing; case-insensitive so "X.PDF" is included.
	    local_docs.extend(
	        f"- {file}"
	        for file in sorted(os.listdir(root_dir))
	        if file.lower().endswith(".pdf")
	    )

	    if local_docs:
	        result.append("=== Local Documents ===\n" + "\n".join(local_docs))

	    # List Firestore documents (by document id) when Firestore is available
	    if db:
	        try:
	            firestore_docs = [f"- {doc.id}" for doc in db.collection("data").stream()]
	            if firestore_docs:
	                result.append("=== Firestore Documents ===\n" + "\n".join(firestore_docs))
	        except Exception as e:
	            # Best effort: report the failure in the listing instead of raising.
	            result.append(f"Error listing Firestore documents: {str(e)}")

	    response = "\n\n".join(result) if result else "No documents found in any source."
	    logger.info(f"TOOL RESULT: list_available_documents found {len(result)} source(s) with documents")
	    return response