Spaces:
Sleeping
Sleeping
File size: 6,408 Bytes
d2fe6cc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
from pdfHandler import PDFProcessor
from docHandler import DocProcessor
from txtHandler import TextProcessor
from webHandler import WebProcessor
from typing import Dict, Any, List
class DocumentManager:
    """Route uploaded files and URLs to format handlers and query them together.

    Every successfully processed source is recorded in ``processed_documents``
    and, when its handler exposes ``get_content()``, its text is appended to
    ``all_content`` under a heading naming the source.

    NOTE(review): one handler instance per format is shared by every document
    of that format. Whether a handler retains earlier documents after
    processing a new one is not visible from this class; if it does not,
    answers attributed to earlier files actually come from the most recent
    file of the same format. Confirm against the handler implementations.
    """

    # content type -> (handler attribute on self, name of its processing method)
    # NOTE(review): legacy "application/msword" is routed to process_docx just
    # like .docx; confirm the handler really accepts the old binary format.
    _HANDLERS: Dict[str, tuple] = {
        "application/pdf": ("pdf_processor", "process_pdf"),
        "application/msword": ("doc_processor", "process_docx"),
        "text/plain": ("txt_processor", "process_text"),
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
            "doc_processor",
            "process_docx",
        ),
    }

    def __init__(self):
        self.pdf_processor = PDFProcessor()
        self.doc_processor = DocProcessor()
        self.txt_processor = TextProcessor()
        self.web_processor = WebProcessor()
        # One dict per processed source with the keys
        # "processor", "file_path", "content_type" and "filename".
        self.processed_documents: List[Dict[str, Any]] = []
        # Combined content for multi-document queries
        self.all_content = ""

    @staticmethod
    def _basename(path: str) -> str:
        """Return the last path component, accepting both '/' and '\\' separators."""
        return path.replace("\\", "/").rsplit("/", 1)[-1]

    def _register(self, processor: Any, source: str, content_type: str,
                  filename: str, heading: str) -> None:
        """Record a processed source and append its text to ``all_content``."""
        self.processed_documents.append({
            "processor": processor,
            "file_path": source,
            "content_type": content_type,
            "filename": filename,
        })
        # Collecting the text is best-effort: handlers are not required to
        # offer get_content(), and a failure here must not undo the
        # registration. The failure is reported instead of being swallowed.
        try:
            if hasattr(processor, "get_content"):
                content = processor.get_content()
                self.all_content += f"\n\n--- {heading} ---\n{content}"
        except Exception as exc:
            print(f"Could not collect content from {filename}: {exc}")

    def process_document(self, file_path: str, content_type: str) -> Dict[str, Any]:
        """Process a file with the handler registered for ``content_type``.

        Args:
            file_path: Location of the file; '/' and '\\' separators are both
                understood when deriving the display filename.
            content_type: MIME type used to select the handler.

        Returns:
            The handler's result dict. An unsupported type yields
            ``{"status": "error", "message": "Unknown file type"}`` and any
            exception is reported as ``{"status": "error", "message": str(e)}``.
        """
        try:
            print(f"Processing file: {file_path} with content type: {content_type}")
            route = self._HANDLERS.get(content_type)
            if route is None:
                return {"status": "error", "message": "Unknown file type"}

            attr_name, method_name = route
            processor = getattr(self, attr_name)
            result = getattr(processor, method_name)(file_path)

            if result["status"] == "success":
                filename = self._basename(file_path)
                self._register(
                    processor, file_path, content_type, filename,
                    heading=f"Document: {filename}",
                )
                print(f"Document added to collection. Total documents: {len(self.processed_documents)}")
            return result
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def query_document(self, query: str) -> Dict[str, Any]:
        """Ask every processed source the same question and merge the answers.

        Sources whose handler raises, reports a non-success status, or returns
        a blank answer are skipped.

        Returns:
            ``{"status": "success", "answer": ...}`` with one "From <name>:"
            section per contributing source, or an error dict when nothing was
            processed or no source produced an answer.
        """
        if not self.processed_documents:
            return {"status": "error", "message": "No documents processed"}
        print(f"Querying {len(self.processed_documents)} documents with question: {query}")
        try:
            all_responses = []
            for doc_info in self.processed_documents:
                processor = doc_info["processor"]
                filename = doc_info["filename"]
                # Entries may have been added by other code, so normalise again.
                just_filename = self._basename(filename)
                try:
                    response = processor.query_response(query)
                    if response.get("status") == "success":
                        answer = response.get("answer", "")
                        if answer and answer.strip():
                            all_responses.append(f"From {just_filename}:\n {answer}")
                except Exception as e:
                    print(f"Error querying {filename}: {e}")
                    continue

            if not all_responses:
                return {"status": "error", "message": "No relevant information found in any documents"}
            return {"status": "success", "answer": "\n\n".join(all_responses)}
        except Exception as e:
            # Fallback: use the last processed document. The fallback itself is
            # guarded so a second failure still reaches the caller as an error
            # dict rather than an exception.
            print(f"Multi-document query failed, using last document: {e}")
            try:
                last_processor = self.processed_documents[-1]["processor"]
                return last_processor.query_response(query)
            except Exception as fallback_error:
                return {"status": "error", "message": str(fallback_error)}

    def clear_documents(self):
        """Clear all previously processed documents"""
        self.processed_documents = []
        self.all_content = ""
        print("All documents cleared - ready for new uploads")

    def process_url(self, url: str) -> Dict[str, Any]:
        """Process a URL and add it to the document collection.

        Returns:
            The web handler's result dict, or
            ``{"status": "error", "message": str(e)}`` if anything raises.
        """
        try:
            result = self.web_processor.process_url(url)
            if result["status"] == "success":
                self._register(
                    self.web_processor, url, "text/html",
                    # A URL ending in '/' has no last segment; call it "index".
                    filename=f"webpage_{url.split('/')[-1] or 'index'}",
                    heading=f"Web Page: {url}",
                )
                print(f"URL processed and added to collection: {url}")
            return result
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def get_status(self) -> Dict[str, Any]:
        """Get current status of processed documents.

        Returns:
            A dict with "total_documents", "document_types" (unique content
            types in first-seen order) and "filenames" (one per source).
        """
        return {
            "total_documents": len(self.processed_documents),
            # dict.fromkeys de-duplicates while keeping a deterministic order.
            "document_types": list(dict.fromkeys(
                doc["content_type"] for doc in self.processed_documents
            )),
            "filenames": [doc["filename"] for doc in self.processed_documents],
        }