Spaces:
Sleeping
Sleeping
debug
Browse files- src/pdf_parser.py +76 -18
- src/rag_system.py +216 -91
src/pdf_parser.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
-
|
| 2 |
"""
|
| 3 |
-
PDF Parser Module
|
| 4 |
"""
|
| 5 |
import os
|
| 6 |
import json
|
|
@@ -15,10 +14,28 @@ from config import DOCSTORE_PATH, PROCESSED_FILES_LOG
|
|
| 15 |
|
| 16 |
|
| 17 |
class PDFParser:
|
| 18 |
-
def __init__(self):
|
| 19 |
self.docstore_path = Path(DOCSTORE_PATH)
|
| 20 |
self.docstore_path.mkdir(exist_ok=True)
|
| 21 |
self.processed_files = self._load_processed_files()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
def _load_processed_files(self) -> Dict[str, str]:
|
| 24 |
"""Load list of already processed files with their hashes"""
|
|
@@ -49,23 +66,48 @@ class PDFParser:
|
|
| 49 |
try:
|
| 50 |
with open(pdf_path, 'rb') as file:
|
| 51 |
reader = PyPDF2.PdfReader(file)
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
except Exception as e:
|
| 55 |
-
|
|
|
|
|
|
|
| 56 |
return text
|
| 57 |
|
| 58 |
def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 59 |
-
"""Extract images from PDF pages"""
|
| 60 |
images_data = []
|
| 61 |
try:
|
|
|
|
|
|
|
| 62 |
images = convert_from_path(pdf_path, dpi=150)
|
|
|
|
|
|
|
| 63 |
for idx, image in enumerate(images):
|
|
|
|
|
|
|
|
|
|
| 64 |
image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
|
| 65 |
image.save(image_path)
|
|
|
|
| 66 |
|
| 67 |
-
# Extract text
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
images_data.append({
|
| 71 |
'page': idx,
|
|
@@ -74,19 +116,20 @@ class PDFParser:
|
|
| 74 |
'description': f"Image from page {idx + 1}"
|
| 75 |
})
|
| 76 |
except Exception as e:
|
| 77 |
-
|
|
|
|
|
|
|
| 78 |
return images_data
|
| 79 |
|
| 80 |
def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 81 |
"""Extract table content from PDF"""
|
| 82 |
tables_data = []
|
| 83 |
try:
|
| 84 |
-
# For simple table extraction, we'll use text patterns
|
| 85 |
-
# For advanced table detection, consider using 'tabula-py' or 'pdfplumber'
|
| 86 |
text = self._extract_text_from_pdf(pdf_path)
|
| 87 |
-
# Basic table detection (lines with multiple spaces or separators)
|
| 88 |
lines = text.split('\n')
|
| 89 |
|
|
|
|
|
|
|
| 90 |
current_table = []
|
| 91 |
for line in lines:
|
| 92 |
if '|' in line or '\t' in line:
|
|
@@ -104,28 +147,41 @@ class PDFParser:
|
|
| 104 |
'content': '\n'.join(current_table),
|
| 105 |
'description': f"Table {len(tables_data) + 1}"
|
| 106 |
})
|
|
|
|
|
|
|
| 107 |
except Exception as e:
|
| 108 |
-
|
|
|
|
| 109 |
return tables_data
|
| 110 |
|
| 111 |
def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 112 |
-
"""Parse PDF and extract text, images, and tables"""
|
| 113 |
file_hash = self._get_file_hash(pdf_path)
|
| 114 |
doc_id = Path(pdf_path).stem
|
| 115 |
|
|
|
|
|
|
|
| 116 |
# Check if file was already processed
|
| 117 |
if doc_id in self.processed_files:
|
| 118 |
if self.processed_files[doc_id] == file_hash:
|
| 119 |
-
|
| 120 |
return self._load_extracted_data(doc_id)
|
| 121 |
|
| 122 |
-
print(f"Processing PDF: {doc_id}")
|
| 123 |
|
| 124 |
# Extract content
|
| 125 |
text = self._extract_text_from_pdf(pdf_path)
|
| 126 |
images = self._extract_images_from_pdf(pdf_path, doc_id)
|
| 127 |
tables = self._extract_tables_from_pdf(pdf_path, doc_id)
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
# Save extracted data
|
| 130 |
self._save_extracted_data(doc_id, text, images, tables)
|
| 131 |
|
|
@@ -145,6 +201,8 @@ class PDFParser:
|
|
| 145 |
data_path = self.docstore_path / f"{doc_id}_data.json"
|
| 146 |
with open(data_path, 'w', encoding='utf-8') as f:
|
| 147 |
json.dump(data, f, ensure_ascii=False, indent=2)
|
|
|
|
|
|
|
| 148 |
|
| 149 |
def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 150 |
"""Load previously extracted data from docstore"""
|
|
@@ -166,4 +224,4 @@ class PDFParser:
|
|
| 166 |
all_docs[doc_id] = json.load(f)
|
| 167 |
except:
|
| 168 |
pass
|
| 169 |
-
return all_docs
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
PDF Parser Module with DEBUG for image extraction
|
| 3 |
"""
|
| 4 |
import os
|
| 5 |
import json
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
class PDFParser:
|
| 17 |
+
def __init__(self, debug: bool = True):
    """Set up the docstore directory, the processed-files log and debug mode.

    Args:
        debug: When True, diagnostic messages are printed to stdout.
    """
    # Set the flag first so that any helper invoked during construction can
    # already consult ``self.debug``.
    self.debug = debug

    self.docstore_path = Path(DOCSTORE_PATH)
    # parents=True: also create missing intermediate directories instead of
    # raising FileNotFoundError when DOCSTORE_PATH is a nested path.
    self.docstore_path.mkdir(parents=True, exist_ok=True)
    self.processed_files = self._load_processed_files()

    if self.debug:
        print("✅ PDFParser initialized with DEBUG mode ON")
|
| 25 |
+
|
| 26 |
+
def _debug_print(self, label: str, data: object):
    """Print a labelled debug message when debug mode is enabled.

    Args:
        label: Short heading printed after the ``[PDF Parser]`` prefix.
        data: Value to show. A dict is printed one ``key: value`` pair per
            line; a list or tuple prints its length followed by at most its
            first three items (each truncated to 100 characters); any other
            value is printed as-is.
    """
    if not self.debug:
        return

    print(f"\n🔍 [PDF Parser] {label}")
    if isinstance(data, dict):
        for key, val in data.items():
            print(f" {key}: {val}")
    elif isinstance(data, (list, tuple)):
        print(f" Count: {len(data)}")
        # Only a short preview, so large collections do not flood the log.
        for i, item in enumerate(data[:3]):
            print(f" [{i}]: {str(item)[:100]}")
    else:
        print(f" {data}")
|
| 39 |
|
| 40 |
def _load_processed_files(self) -> Dict[str, str]:
|
| 41 |
"""Load list of already processed files with their hashes"""
|
|
|
|
| 66 |
try:
|
| 67 |
with open(pdf_path, 'rb') as file:
|
| 68 |
reader = PyPDF2.PdfReader(file)
|
| 69 |
+
page_count = len(reader.pages)
|
| 70 |
+
self._debug_print("PDF Text Extraction", f"Total pages: {page_count}")
|
| 71 |
+
|
| 72 |
+
for page_num, page in enumerate(reader.pages):
|
| 73 |
+
page_text = page.extract_text()
|
| 74 |
+
text += page_text + "\n"
|
| 75 |
+
self._debug_print(f"Page {page_num+1} Text Length", len(page_text))
|
| 76 |
except Exception as e:
|
| 77 |
+
self._debug_print("ERROR extracting text", str(e))
|
| 78 |
+
|
| 79 |
+
self._debug_print("Total Text Extracted", len(text))
|
| 80 |
return text
|
| 81 |
|
| 82 |
def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 83 |
+
"""Extract images from PDF pages with detailed debugging"""
|
| 84 |
images_data = []
|
| 85 |
try:
|
| 86 |
+
self._debug_print("Image Extraction Started", f"File: {pdf_path}")
|
| 87 |
+
|
| 88 |
images = convert_from_path(pdf_path, dpi=150)
|
| 89 |
+
self._debug_print("PDF to Images Conversion", f"Total images: {len(images)}")
|
| 90 |
+
|
| 91 |
for idx, image in enumerate(images):
|
| 92 |
+
self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
|
| 93 |
+
|
| 94 |
+
# Save image
|
| 95 |
image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
|
| 96 |
image.save(image_path)
|
| 97 |
+
self._debug_print(f"Image {idx} Saved", str(image_path))
|
| 98 |
|
| 99 |
+
# Extract text using OCR
|
| 100 |
+
self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR...")
|
| 101 |
+
|
| 102 |
+
try:
|
| 103 |
+
ocr_text = pytesseract.image_to_string(image, lang='rus+eng')
|
| 104 |
+
self._debug_print(f"Image {idx} OCR Result", f"Length: {len(ocr_text)}, Content: {ocr_text[:200] if ocr_text else 'EMPTY'}")
|
| 105 |
+
|
| 106 |
+
if not ocr_text or len(ocr_text.strip()) < 5:
|
| 107 |
+
self._debug_print(f"Image {idx} WARNING", "⚠️ OCR returned empty or very short text!")
|
| 108 |
+
except Exception as ocr_error:
|
| 109 |
+
self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
|
| 110 |
+
ocr_text = f"[Image {idx}: OCR failed - {str(ocr_error)}]"
|
| 111 |
|
| 112 |
images_data.append({
|
| 113 |
'page': idx,
|
|
|
|
| 116 |
'description': f"Image from page {idx + 1}"
|
| 117 |
})
|
| 118 |
except Exception as e:
|
| 119 |
+
self._debug_print("ERROR extracting images", str(e))
|
| 120 |
+
|
| 121 |
+
self._debug_print("Image Extraction Complete", f"Total: {len(images_data)}")
|
| 122 |
return images_data
|
| 123 |
|
| 124 |
def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 125 |
"""Extract table content from PDF"""
|
| 126 |
tables_data = []
|
| 127 |
try:
|
|
|
|
|
|
|
| 128 |
text = self._extract_text_from_pdf(pdf_path)
|
|
|
|
| 129 |
lines = text.split('\n')
|
| 130 |
|
| 131 |
+
self._debug_print("Table Detection", f"Scanning {len(lines)} lines")
|
| 132 |
+
|
| 133 |
current_table = []
|
| 134 |
for line in lines:
|
| 135 |
if '|' in line or '\t' in line:
|
|
|
|
| 147 |
'content': '\n'.join(current_table),
|
| 148 |
'description': f"Table {len(tables_data) + 1}"
|
| 149 |
})
|
| 150 |
+
|
| 151 |
+
self._debug_print("Tables Found", len(tables_data))
|
| 152 |
except Exception as e:
|
| 153 |
+
self._debug_print("ERROR extracting tables", str(e))
|
| 154 |
+
|
| 155 |
return tables_data
|
| 156 |
|
| 157 |
def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 158 |
+
"""Parse PDF and extract text, images, and tables with debug output"""
|
| 159 |
file_hash = self._get_file_hash(pdf_path)
|
| 160 |
doc_id = Path(pdf_path).stem
|
| 161 |
|
| 162 |
+
self._debug_print("PDF Parsing Started", f"File: {doc_id}, Hash: {file_hash}")
|
| 163 |
+
|
| 164 |
# Check if file was already processed
|
| 165 |
if doc_id in self.processed_files:
|
| 166 |
if self.processed_files[doc_id] == file_hash:
|
| 167 |
+
self._debug_print("Status", f"File {doc_id} already processed, loading from cache")
|
| 168 |
return self._load_extracted_data(doc_id)
|
| 169 |
|
| 170 |
+
print(f"\n📄 Processing PDF: {doc_id}")
|
| 171 |
|
| 172 |
# Extract content
|
| 173 |
text = self._extract_text_from_pdf(pdf_path)
|
| 174 |
images = self._extract_images_from_pdf(pdf_path, doc_id)
|
| 175 |
tables = self._extract_tables_from_pdf(pdf_path, doc_id)
|
| 176 |
|
| 177 |
+
# Summary
|
| 178 |
+
self._debug_print("Extraction Summary", {
|
| 179 |
+
'text_length': len(text),
|
| 180 |
+
'images_count': len(images),
|
| 181 |
+
'tables_count': len(tables),
|
| 182 |
+
'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
|
| 183 |
+
})
|
| 184 |
+
|
| 185 |
# Save extracted data
|
| 186 |
self._save_extracted_data(doc_id, text, images, tables)
|
| 187 |
|
|
|
|
| 201 |
data_path = self.docstore_path / f"{doc_id}_data.json"
|
| 202 |
with open(data_path, 'w', encoding='utf-8') as f:
|
| 203 |
json.dump(data, f, ensure_ascii=False, indent=2)
|
| 204 |
+
|
| 205 |
+
self._debug_print("Data Saved", str(data_path))
|
| 206 |
|
| 207 |
def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 208 |
"""Load previously extracted data from docstore"""
|
|
|
|
| 224 |
all_docs[doc_id] = json.load(f)
|
| 225 |
except:
|
| 226 |
pass
|
| 227 |
+
return all_docs
|
src/rag_system.py
CHANGED
|
@@ -1,142 +1,267 @@
|
|
| 1 |
"""
|
| 2 |
-
|
|
|
|
| 3 |
"""
|
| 4 |
-
import json
|
| 5 |
-
import hashlib
|
| 6 |
from typing import List, Dict
|
| 7 |
from langchain_openai import ChatOpenAI
|
| 8 |
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
LANGUAGE, CACHE_RESPONSES, BATCH_SEARCH_RESULTS
|
| 12 |
-
)
|
| 13 |
|
| 14 |
|
| 15 |
class MultimodalRAG:
|
| 16 |
-
"""RAG system
|
| 17 |
|
| 18 |
-
def __init__(self, api_key: str = None):
|
| 19 |
api_key = api_key or OPENAI_API_KEY
|
|
|
|
| 20 |
|
| 21 |
-
# Use gpt-4o-mini for 20% cost reduction
|
| 22 |
self.llm = ChatOpenAI(
|
| 23 |
model_name=OPENAI_MODEL,
|
| 24 |
-
|
| 25 |
temperature=TEMPERATURE,
|
| 26 |
max_tokens=MAX_TOKENS,
|
| 27 |
)
|
| 28 |
|
| 29 |
-
self.
|
| 30 |
-
self.doc_summaries = {} # Store doc summaries
|
| 31 |
self.language = LANGUAGE
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
def
|
| 34 |
-
"""
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
def
|
| 38 |
-
"""
|
| 39 |
-
context = ""
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
if content_type == 'image':
|
| 46 |
-
context += f"[
|
| 47 |
elif content_type == 'table':
|
| 48 |
-
context += f"[
|
| 49 |
else:
|
| 50 |
-
context += f"[{idx}]{content}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
return context
|
| 53 |
|
| 54 |
-
def answer_question(self, question: str, search_results: List[Dict]) -> str:
|
| 55 |
-
"""Generate answer
|
| 56 |
-
|
| 57 |
-
# Check cache first
|
| 58 |
-
if CACHE_RESPONSES:
|
| 59 |
-
cache_key = self._get_cache_key(question)
|
| 60 |
-
if cache_key in self.response_cache:
|
| 61 |
-
return self.response_cache[cache_key]
|
| 62 |
-
|
| 63 |
try:
|
| 64 |
-
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
-
|
| 68 |
-
prompt = f"""Q:{question}
|
| 69 |
-
C:{context}
|
| 70 |
-
A:"""
|
| 71 |
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
answer = response.content
|
| 75 |
|
| 76 |
-
#
|
| 77 |
-
|
| 78 |
-
|
| 79 |
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
except Exception as e:
|
| 83 |
-
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
def
|
| 86 |
-
"""Summarize document
|
| 87 |
-
|
| 88 |
-
if doc_id in self.doc_summaries:
|
| 89 |
-
return self.doc_summaries[doc_id]
|
| 90 |
-
|
| 91 |
try:
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
summary = response.content
|
| 101 |
|
| 102 |
-
|
| 103 |
-
self.
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
except Exception as e:
|
| 107 |
-
|
|
|
|
|
|
|
| 108 |
|
| 109 |
-
def
|
| 110 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
-
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
for i, q in enumerate(questions, 1):
|
| 118 |
-
qa_prompt += f"Q{i}:{q}\n"
|
| 119 |
|
| 120 |
-
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
-
def
|
| 133 |
-
"""
|
| 134 |
-
self.
|
| 135 |
|
| 136 |
-
def
|
| 137 |
-
"""
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
'cached_summaries': len(self.doc_summaries),
|
| 141 |
-
'total_cache_size': len(json.dumps(self.response_cache))
|
| 142 |
-
}
|
|
|
|
| 1 |
"""
|
| 2 |
+
LLM Integration Module using OpenAI GPT-4o and LangChain
|
| 3 |
+
FIXED for LangChain 0.1+ with IMAGE DEBUGGING
|
| 4 |
"""
|
|
|
|
|
|
|
| 5 |
from typing import List, Dict
|
| 6 |
from langchain_openai import ChatOpenAI
|
| 7 |
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
|
| 8 |
+
import os
|
| 9 |
+
from config import OPENAI_API_KEY, OPENAI_MODEL, TEMPERATURE, MAX_TOKENS, LANGUAGE
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
class MultimodalRAG:
    """RAG system with multimodal support using LangChain and OpenAI.

    Holds a chat model, a rolling conversation history and a debug flag.
    Search results passed to the methods are dictionaries read through the
    keys ``type``, ``content`` and ``distance``.
    """

    # Upper bound on the number of messages kept in ``conversation_history``.
    MAX_HISTORY_MESSAGES = 10

    def __init__(self, api_key: str = None, debug: bool = True):
        """Create the chat model and initialise an empty conversation.

        Args:
            api_key: OpenAI API key; falls back to ``OPENAI_API_KEY`` from the
                config module when omitted or empty.
            debug: When True, diagnostic messages are printed to stdout.
        """
        api_key = api_key or OPENAI_API_KEY
        self.debug = debug

        self.llm = ChatOpenAI(
            model_name=OPENAI_MODEL,
            api_key=api_key,
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )

        self.conversation_history = []
        self.language = LANGUAGE

        if self.debug:
            print("✅ MultimodalRAG initialized with DEBUG mode ON")

    def _debug_print(self, label: str, data: object):
        """Print a labelled debug message when debug mode is enabled.

        Lists and dicts are shown with their type name and the first 500
        characters of their string form; anything else is printed as-is.
        """
        if not self.debug:
            return

        print(f"\n🔍 DEBUG [{label}]:")
        if isinstance(data, (list, dict)):
            print(f" Type: {type(data).__name__}")
            print(f" Content: {str(data)[:500]}...")
        else:
            print(f" {data}")

    def _build_context_prompt(self, search_results: List[Dict]) -> str:
        """Build the context section of the prompt from search results.

        Each result becomes a ``[Image n]``, ``[Table n]`` or ``[Text n]``
        entry (``n`` is 1-based); any type other than ``image``/``table`` is
        treated as text. A missing or ``None`` content is rendered as an empty
        string.

        Args:
            search_results: Result dictionaries with optional ``type``,
                ``content`` and ``distance`` keys.

        Returns:
            The header line followed by one entry per result.
        """
        self._debug_print("Search Results Count", len(search_results))

        labels = {'image': 'Image', 'table': 'Table'}
        counts = {'Text': 0, 'Image': 0, 'Table': 0}
        parts = ["Based on the following document content:\n\n"]

        for idx, result in enumerate(search_results, 1):
            content_type = result.get('type', 'unknown')
            # ``or ''`` also covers an explicit None stored under the key.
            content = result.get('content') or ''
            label = labels.get(content_type, 'Text')
            counts[label] += 1

            if self.debug:
                distance = result.get('distance', 0)
                # A missing/None/non-numeric distance must not abort prompt
                # construction just because of a debug format spec.
                try:
                    distance_repr = f"{float(distance):.3f}"
                except (TypeError, ValueError):
                    distance_repr = str(distance)
                self._debug_print(
                    f"Result {idx}: Type={content_type}, Distance={distance_repr}, Length={len(content)}",
                    content[:100]
                )

            parts.append(f"[{label} {idx}] {content}\n\n")

        context = "".join(parts)

        self._debug_print(
            "Context Composition",
            f"Text: {counts['Text']}, Images: {counts['Image']}, Tables: {counts['Table']}"
        )
        self._debug_print("Total Context Length", len(context))

        return context

    def answer_question(self, question: str, search_results: List[Dict], streaming: bool = False) -> str:
        """Generate an answer to a user question based on search results.

        The question (together with its context) and the model's reply are
        added to the conversation history only after the model call succeeds,
        so a failed call leaves the history unchanged.

        Args:
            question: The user's question.
            search_results: Retrieved chunks used to build the context.
            streaming: Accepted for interface compatibility; not used by this
                implementation.

        Returns:
            The model's answer, or an ``"Error: ..."`` string when anything
            in the process raises.
        """
        try:
            self._debug_print("Question", question)

            context = self._build_context_prompt(search_results)

            system_message = SystemMessage(
                content=(
                    "You are a helpful assistant that answers questions about documents.\n"
                    "You work with documents that contain text, tables, and images.\n"
                    f"Language: {self.language}\n"
                    "\n"
                    "Provide accurate, concise answers based on the provided context.\n"
                    "If information is not found in the context, say so clearly.\n"
                    "For tables and images, provide detailed analysis when relevant."
                )
            )

            # NOTE: the retrieved context is embedded in the user message, so
            # it is stored in the history together with the question.
            user_message = HumanMessage(
                content=f"{context}\n\nQuestion: {question}\n\nPlease answer based on the context above."
            )

            self._debug_print("User Message Length", len(user_message.content))

            self._debug_print("Calling LLM", f"Model: {OPENAI_MODEL}")
            response = self.llm.invoke(
                [system_message, *self.conversation_history, user_message]
            )

            # Record the exchange only now that the call has succeeded;
            # otherwise a failure would leave an unanswered human message
            # behind and break the question/answer alternation.
            self.conversation_history.extend([user_message, response])

            self._debug_print("Response Length", len(response.content))

            # Keep the conversation history manageable.
            if len(self.conversation_history) > self.MAX_HISTORY_MESSAGES:
                self.conversation_history = self.conversation_history[-self.MAX_HISTORY_MESSAGES:]

            return response.content

        except Exception as e:
            # Boundary handler: callers receive an error string, not an exception.
            self._debug_print("ERROR in answer_question", str(e))
            print(f"Error generating answer: {e}")
            return f"Error: Could not generate answer. {str(e)}"

    def _build_summary_prompt(self, document_content: str, images: List[Dict], tables: List[Dict]) -> str:
        """Assemble the summarization prompt from text, image OCR and tables."""
        image_ocr_texts = []
        for idx, img in enumerate(images):
            ocr_text = img.get('ocr_text') or ''
            # Whitespace-only OCR output carries no information for the model.
            if ocr_text.strip():
                image_ocr_texts.append(f"Image {idx}: {ocr_text}")
                self._debug_print(f"Image {idx} OCR", ocr_text[:100])
            else:
                self._debug_print(f"Image {idx} OCR", "⚠️ EMPTY - No OCR text extracted!")

        table_texts = []
        for idx, tbl in enumerate(tables):
            table_content = tbl.get('content') or ''
            if table_content:
                table_texts.append(f"Table {idx}:\n{table_content}")
                self._debug_print(f"Table {idx} Content", table_content[:100])
            else:
                self._debug_print(f"Table {idx} Content", "⚠️ EMPTY - No table content!")

        sections = [
            f"Please provide a comprehensive summary of the following document content in {self.language}.\n"
            "\n"
            "Document Text:\n"
            f"{document_content}\n"
            "\n"
        ]

        if image_ocr_texts:
            # Report the number of images that actually contributed text,
            # which is what the listing below contains.
            sections.append(f"\nExtracted text from {len(image_ocr_texts)} images:\n")
            sections.append("\n".join(image_ocr_texts))
            sections.append("\n")

        if table_texts:
            sections.append(f"\nDocument contains {len(tables)} tables:\n")
            sections.append("\n".join(table_texts))
            sections.append("\n")

        sections.append(
            "\n"
            "Please include in your summary:\n"
            "1. Main topics covered\n"
            "2. Key points and findings\n"
            "3. Important data and numbers\n"
            "4. Key information from images (if present)\n"
            "5. Key information from tables (if present)\n"
            "6. Overall document purpose"
        )
        return "".join(sections)

    def summarize_document(self, document_content: str, images: List[Dict] = None, tables: List[Dict] = None) -> str:
        """Summarize extracted document content including images and tables.

        Args:
            document_content: Plain text of the document.
            images: Image records; the ``ocr_text`` key of each is used.
            tables: Table records; the ``content`` key of each is used.

        Returns:
            The model's summary, or an ``"Error: ..."`` string when anything
            in the process raises. The conversation history is not touched.
        """
        try:
            if images is None:
                images = []
            if tables is None:
                tables = []

            self._debug_print(
                "Document Summarization Started",
                f"Text length: {len(document_content)}, Images: {len(images)}, Tables: {len(tables)}"
            )

            summary_prompt = self._build_summary_prompt(document_content, images, tables)

            self._debug_print("Summary Prompt Length", len(summary_prompt))
            self._debug_print("Summary Prompt Content", summary_prompt[:200])

            message = HumanMessage(content=summary_prompt)
            self._debug_print("Calling LLM for summarization", f"Model: {OPENAI_MODEL}")

            response = self.llm.invoke([message])

            self._debug_print("Summary Response Length", len(response.content))

            return response.content

        except Exception as e:
            # Boundary handler: callers receive an error string, not an exception.
            self._debug_print("ERROR in summarize_document", str(e))
            print(f"Error summarizing document: {e}")
            return f"Error: Could not summarize document. {str(e)}"

    def debug_search_results(self, search_results: List[Dict]) -> Dict:
        """Return a detailed breakdown of search results for debugging.

        Only the types ``text``, ``image`` and ``table`` are tallied in
        ``by_type``; other types still appear in ``details`` and in
        ``total_results``. Distances that cannot be converted to a number
        (e.g. ``None``) are left out of ``average_distance``.

        Returns:
            A dict with the keys ``total_results``, ``by_type``,
            ``average_distance``, ``images_with_content``, ``images_empty``
            and ``details``.
        """
        analysis = {
            'total_results': len(search_results),
            'by_type': {'text': 0, 'image': 0, 'table': 0},
            'average_distance': 0,
            'images_with_content': 0,
            'images_empty': 0,
            'details': []
        }

        distances = []

        for idx, result in enumerate(search_results):
            content_type = result.get('type', 'unknown')
            # ``or ''`` also covers an explicit None stored under the key.
            content = result.get('content') or ''
            distance = result.get('distance', 0)
            has_content = bool(content.strip())

            if content_type in analysis['by_type']:
                analysis['by_type'][content_type] += 1

            try:
                distances.append(float(distance))
            except (TypeError, ValueError):
                # Non-numeric distance: keep it in the details, skip the average.
                pass

            if content_type == 'image':
                if has_content:
                    analysis['images_with_content'] += 1
                else:
                    analysis['images_empty'] += 1

            analysis['details'].append({
                'index': idx,
                'type': content_type,
                'distance': distance,
                'content_length': len(content),
                'has_content': has_content
            })

        if distances:
            analysis['average_distance'] = sum(distances) / len(distances)

        self._debug_print("Search Results Analysis", analysis)
        return analysis

    def clear_history(self):
        """Clear conversation history."""
        self.conversation_history = []
        if self.debug:
            print("✅ Conversation history cleared")

    def get_history(self) -> List:
        """Return the conversation history list (the live object, not a copy)."""
        return self.conversation_history

    def toggle_debug(self, enabled: bool):
        """Turn debug mode on or off and announce the new state."""
        self.debug = enabled
        print(f"🔍 Debug mode: {'ON' if enabled else 'OFF'}")
|
|
|
|
|
|
|
|
|