dnj0 commited on
Commit
835ecb4
·
verified ·
1 Parent(s): 2b7f331

Upload 7 files

Browse files
src/.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_MODEL=gpt-4o-mini
src/app.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import os
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables from src/.env (e.g. OPENAI_API_KEY, OPENAI_MODEL)
load_dotenv()

# Import custom modules (after load_dotenv so they can see the env vars)
from pdf_processor import PDFProcessor, prepare_documents_for_embedding
from embeddings_handler import CLIPLangChainEmbeddings
from vectorstore_manager import VectorStoreManager
from image_summarizer import ImageSummarizer, process_images_in_documents
from rag_chain import RAGChain
from langchain_core.documents import Document

# Page configuration
st.set_page_config(
    page_title="Multimodal RAG Assistant",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Global CSS tweaks for the main area and chat messages
st.markdown("""
<style>
.main {
padding: 2rem;
}
.stChatMessage {
padding: 1rem;
border-radius: 0.5rem;
margin-bottom: 1rem;
}
</style>
""", unsafe_allow_html=True)

# Initialize session state: the store, chain, and count survive Streamlit reruns
if "vector_store" not in st.session_state:
    st.session_state.vector_store = None
if "rag_chain" not in st.session_state:
    st.session_state.rag_chain = None
if "document_count" not in st.session_state:
    st.session_state.document_count = 0

# Sidebar configuration
st.sidebar.title("⚙️ Configuration")
st.sidebar.markdown("---")

# OpenAI API Key (pre-filled from the environment when available)
api_key = st.sidebar.text_input(
    "OpenAI API Key",
    type="password",
    value=os.getenv("OPENAI_API_KEY", ""),
    help="Enter your OpenAI API key"
)

# Export the key so modules that read the env var directly can find it
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key

# PDF directory setup
pdf_dir = st.sidebar.text_input(
    "PDF Directory",
    value="./pdfs",
    help="Directory containing PDF files"
)

# Vector store settings
st.sidebar.markdown("### Vector Store")
collection_name = st.sidebar.text_input(
    "Collection Name",
    value="pdf_documents",
    help="ChromaDB collection name"
)

persist_dir = st.sidebar.text_input(
    "Persist Directory",
    value="./chroma_db",
    help="Directory for ChromaDB storage"
)

# Initialize vector store button
if st.sidebar.button("🔄 Initialize Vector Store", use_container_width=True):
    with st.spinner("Initializing vector store..."):
        try:
            # CLIP embeddings used for all stored content
            embeddings = CLIPLangChainEmbeddings(
                model_name="ViT-B-32",
                pretrained="openai"
            )

            # Persistent ChromaDB-backed store
            st.session_state.vector_store = VectorStoreManager(
                persist_dir=persist_dir,
                collection_name=collection_name,
                embeddings=embeddings
            )

            # Build the RAG chain on top of the store's retriever
            retriever = st.session_state.vector_store.get_retriever()
            st.session_state.rag_chain = RAGChain(retriever, api_key=api_key)

            st.session_state.document_count = st.session_state.vector_store.collection_count()
            st.success("✅ Vector store initialized!")

        except Exception as e:
            st.error(f"❌ Error initializing vector store: {str(e)}")

# Load and process PDFs button
if st.sidebar.button("📥 Load & Process PDFs", use_container_width=True):
    if not api_key:
        st.error("Please enter OpenAI API Key first")
    elif st.session_state.vector_store is None:
        st.error("Please initialize vector store first")
    else:
        with st.spinner("Processing PDFs..."):
            try:
                # Extract text/tables/images from every PDF in the directory
                pdf_processor = PDFProcessor(pdf_dir=pdf_dir)
                documents_data = pdf_processor.process_all_pdfs()

                if not documents_data:
                    st.warning(f"No PDFs found in {pdf_dir}")
                else:
                    # Generate LLM summaries for the extracted images
                    image_summarizer = ImageSummarizer(api_key=api_key)
                    documents_data = process_images_in_documents(
                        documents_data,
                        image_summarizer
                    )

                    # Flatten pages into LangChain Documents for embedding
                    all_documents = []
                    for doc_data in documents_data:
                        doc_tuples = prepare_documents_for_embedding(doc_data)
                        for text, metadata in doc_tuples:
                            all_documents.append(
                                Document(page_content=text, metadata=metadata)
                            )

                    # Add to vector store
                    st.session_state.vector_store.add_documents(all_documents)
                    st.session_state.document_count = st.session_state.vector_store.collection_count()

                    # Reinitialize RAG chain so it retrieves over the new documents
                    retriever = st.session_state.vector_store.get_retriever()
                    st.session_state.rag_chain = RAGChain(retriever, api_key=api_key)

                    st.success(f"✅ Processed {len(documents_data)} PDFs with {len(all_documents)} chunks")
                    st.info(f"Total documents in store: {st.session_state.document_count}")

            except Exception as e:
                st.error(f"❌ Error processing PDFs: {str(e)}")

# Display vector store status
st.sidebar.markdown("### Status")
if st.session_state.vector_store:
    doc_count = st.session_state.vector_store.collection_count()
    st.sidebar.success(f"✅ Vector Store Ready")
    st.sidebar.metric("Documents in Store", doc_count)
else:
    st.sidebar.warning("⚠️ Vector Store Not Initialized")

# Main content area
st.title("📄 Multimodal PDF RAG Assistant")
st.markdown("Ask questions about your PDF documents. Responses will be provided in Russian.")

# Show onboarding steps until the RAG chain exists
if st.session_state.rag_chain is None:
    st.info("""
### Getting Started:
1. Enter your OpenAI API Key in the sidebar
2. Click "Initialize Vector Store"
3. Place PDF files in the configured directory
4. Click "Load & Process PDFs"
5. Ask questions in the chat below
""")
else:
    # Chat interface
    st.markdown("---")
    st.markdown("### Ask a Question")

    col1, col2 = st.columns([1, 0.15])

    with col1:
        user_question = st.text_input(
            "Your question:",
            placeholder="Ask about your documents...",
            label_visibility="collapsed"
        )

    with col2:
        search_button = st.button("🔍 Search", use_container_width=True)

    # Process question
    if search_button and user_question:
        with st.spinner("🤖 Searching documents and generating response..."):
            try:
                result = st.session_state.rag_chain.query(user_question)

                # Display answer
                st.markdown("### Answer")
                st.markdown(result["answer"])

                # Display sources with metadata in collapsible panels
                if result["sources"]:
                    st.markdown("### Sources")
                    for i, source in enumerate(result["sources"], 1):
                        with st.expander(f"Source {i} - {source['metadata'].get('filename', 'Unknown')}"):
                            st.markdown(f"**Type:** {source['metadata'].get('type', 'Unknown')}")
                            st.markdown(f"**Page:** {source['metadata'].get('page', 'Unknown')}")
                            st.markdown(f"**Content:** {source['content']}")

            except Exception as e:
                st.error(f"Error processing question: {str(e)}")

# Footer
st.markdown("---")
st.markdown("""
<div style="text-align: center; color: gray; font-size: 0.8rem;">
Powered by LangChain, ChromaDB, CLIP, and OpenAI
</div>
""", unsafe_allow_html=True)
src/embeddings_handler.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import open_clip
3
+ from typing import List
4
+ import numpy as np
5
+
6
class CLIPEmbeddingsHandler:
    """Generates normalized CLIP embeddings for text and base64-encoded images."""

    def __init__(self, model_name: str = "ViT-B-32", pretrained: str = "openai"):
        """Load the CLIP model, eval preprocessing transform, and tokenizer.

        Args:
            model_name: open_clip architecture name.
            pretrained: Pretrained weight tag.

        Raises:
            Exception: re-raised from open_clip if the model cannot be loaded.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        try:
            # create_model_and_transforms returns (model, train_preprocess,
            # eval_preprocess) — we keep only the eval transform.
            self.model, _, self.preprocess = open_clip.create_model_and_transforms(
                model_name,
                pretrained=pretrained,
                device=self.device
            )

            self.tokenizer = open_clip.get_tokenizer(model_name)
            self.model.eval()  # inference only: disables dropout/batch-norm updates

            print(f"✅ CLIP model loaded on {self.device}")
            print(f" Model: {model_name}")

        except Exception as e:
            print(f"❌ Error loading CLIP model: {e}")
            raise

    def embed_text(self, texts: List[str]) -> np.ndarray:
        """Embed each string; returns an array of shape (len(texts), dim).

        Texts are L2-normalized so cosine similarity equals the dot product.
        A text that fails to embed yields a zero vector instead of aborting.
        """
        embeddings = []

        with torch.no_grad():
            for text in texts:
                try:
                    tokens = self.tokenizer(text).to(self.device)
                    text_features = self.model.encode_text(tokens)
                    text_features /= text_features.norm(dim=-1, keepdim=True)
                    embeddings.append(text_features.cpu().numpy())
                except Exception as e:
                    print(f"⚠️ Error embedding text: {e}")
                    # FIX: successful encodings have shape (1, dim); the old
                    # (512,)-shaped fallback made np.array() below fail to
                    # stack. ViT-B-32 emits 512-dim features — TODO confirm
                    # if other model_name values are used.
                    embeddings.append(np.zeros((1, 512)))

        result = np.array(embeddings).squeeze()
        # A single input squeezes to 1-D; restore the (1, dim) batch shape.
        if len(result.shape) == 1:
            result = np.expand_dims(result, axis=0)
        return result

    def embed_image_base64(self, image_base64: str) -> np.ndarray:
        """Embed one base64-encoded image; returns a (dim,) normalized vector.

        Returns a zero vector on any decoding/encoding failure.
        """
        import base64
        import io
        from PIL import Image

        try:
            image_data = base64.b64decode(image_base64)
            image = Image.open(io.BytesIO(image_data)).convert("RGB")

            # Use the evaluation preprocessing; unsqueeze adds the batch dim.
            image_tensor = self.preprocess(image).unsqueeze(0).to(self.device)

            with torch.no_grad():
                image_features = self.model.encode_image(image_tensor)
                image_features /= image_features.norm(dim=-1, keepdim=True)

            return image_features.cpu().numpy().squeeze()

        except Exception as e:
            print(f"❌ Error embedding image: {e}")
            return np.zeros(512)
72
+
73
+
74
+ # LangChain wrapper
75
+ from langchain_core.embeddings import Embeddings
76
+
77
class CLIPLangChainEmbeddings(Embeddings):
    """LangChain Embeddings adapter backed by CLIPEmbeddingsHandler."""

    def __init__(self, model_name: str = "ViT-B-32", pretrained: str = "openai"):
        self.handler = CLIPEmbeddingsHandler(model_name, pretrained)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs."""
        matrix = self.handler.embed_text(texts)
        # A single text may come back squeezed to 1-D; re-wrap it as one row.
        return [matrix.tolist()] if matrix.ndim == 1 else matrix.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed query text."""
        vector = self.handler.embed_text([text])
        return vector.tolist() if vector.ndim == 1 else vector[0].tolist()
src/image_summarizer.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+ from typing import Optional
4
+ from openai import OpenAI
5
+
6
class ImageSummarizer:
    """Summarizes images using OpenAI's vision-capable chat API."""

    def __init__(self, api_key: Optional[str] = None):
        """Initialize the OpenAI client (falls back to OPENAI_API_KEY env var)."""
        self.client = OpenAI(api_key=api_key or os.getenv("OPENAI_API_KEY"))

    def summarize_image_base64(self,
                               image_base64: str,
                               image_format: str = "png") -> str:
        """
        Summarize an image using OpenAI vision.

        Args:
            image_base64: Base64 encoded image
            image_format: Image format (png, jpg, etc.)

        Returns:
            Image description/summary (in Russian, per the prompt), or a
            Russian placeholder string if the API call fails.
        """
        try:
            response = self.client.chat.completions.create(
                # FIX: honor OPENAI_MODEL from the environment — src/.env
                # ships this setting but it was never read; the previous
                # hard-coded value remains the fallback.
                model=os.getenv("OPENAI_MODEL", "gpt-4o-mini"),
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/{image_format};base64,{image_base64}"
                                }
                            },
                            {
                                "type": "text",
                                "text": "Пожалуйста, опишите детально содержание этого изображения на русском языке. Укажите все видимые объекты, текст, диаграммы, графики и их взаимосвязь."
                            }
                        ]
                    }
                ],
                max_tokens=500
            )

            return response.choices[0].message.content

        except Exception as e:
            # Best-effort: a failed summary must not abort PDF processing.
            print(f"Error summarizing image: {e}")
            return f"Изображение на странице (ошибка обработки: {str(e)})"
54
+
55
+
56
def process_images_in_documents(documents_data: list,
                                image_summarizer: ImageSummarizer) -> list:
    """
    Attach an LLM-generated summary to every extracted image.

    Mutates each image dict in place, adding a "summary" key, and returns
    the same documents_data list for convenient chaining.

    Args:
        documents_data: List of document content dictionaries
        image_summarizer: ImageSummarizer instance

    Returns:
        Updated documents with image summaries
    """
    for document in documents_data:
        for page in document.get("pages", []):
            for img in page.get("images", []):
                encoded = img.get("base64")
                # Skip images whose bytes could not be extracted.
                if not encoded:
                    continue
                print(f"Summarizing image from page {page.get('page_number')}")
                img["summary"] = image_summarizer.summarize_image_base64(
                    encoded,
                    img.get("format", "png")
                )

    return documents_data
src/pdf_processor.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import base64
4
+ import hashlib
5
+ from pathlib import Path
6
+ from typing import List, Dict, Tuple
7
+ import pdfplumber
8
+ import pymupdf
9
+ from PIL import Image
10
+ import io
11
+
12
class PDFProcessor:
    """Extracts text, tables, and images from PDFs, with a JSON result cache."""

    def __init__(self, pdf_dir: str = "./pdfs", cache_file: str = ".pdf_cache.json"):
        """
        Args:
            pdf_dir: Directory scanned for *.pdf files (created if missing).
            cache_file: JSON file holding extracted content keyed by filename.
        """
        self.pdf_dir = pdf_dir
        self.cache_file = cache_file
        self.cache = self._load_cache()
        os.makedirs(pdf_dir, exist_ok=True)

    def _load_cache(self) -> Dict:
        """Load the processing cache; tolerate a missing or corrupt cache file."""
        if os.path.exists(self.cache_file):
            try:
                with open(self.cache_file, 'r') as f:
                    return json.load(f)
            except (OSError, json.JSONDecodeError) as e:
                # FIX: a corrupt cache file previously crashed __init__;
                # start with an empty cache and rebuild instead.
                print(f"⚠️ Ignoring unreadable cache {self.cache_file}: {e}")
        return {}

    def _save_cache(self):
        """Persist the processing cache to disk."""
        with open(self.cache_file, 'w') as f:
            json.dump(self.cache, f, indent=2)

    def _get_file_hash(self, filepath: str) -> str:
        """MD5 of the file contents — change detection only, not security."""
        hash_md5 = hashlib.md5()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def _extract_images_from_page(self, pdf_path: str, page_num: int) -> List[Dict]:
        """Extract embedded images from one page (0-based) as base64 dicts."""
        images = []
        try:
            # FIX: the context manager guarantees the document is closed even
            # if an unexpected exception escapes the loop below (the original
            # leaked the handle in that case).
            with pymupdf.open(pdf_path) as doc:
                if page_num >= len(doc):
                    print(f"⚠️ Page {page_num} does not exist")
                    return images

                page = doc[page_num]
                image_list = page.get_images()  # list of xref tuples
                if not image_list:
                    return images

                print(f"Found {len(image_list)} images on page {page_num}")

                for img_index, img_info in enumerate(image_list):
                    xref = img_info[0]  # first tuple element is the xref
                    try:
                        if not isinstance(xref, int):
                            print(f"⚠️ Invalid xref type: {type(xref).__name__}")
                            continue

                        img_data = doc.extract_image(xref)
                        if not img_data or "image" not in img_data:
                            print(f"⚠️ No image data at xref {xref}")
                            continue

                        images.append({
                            "type": "image",
                            "format": img_data.get("ext", "png"),
                            "base64": base64.b64encode(img_data["image"]).decode(),
                            "page": page_num,
                            "index": img_index,
                            "xref": xref
                        })
                        print(f"✅ Image {img_index + 1}/{len(image_list)}")

                    except ValueError as e:
                        # pymupdf raises ValueError for bad/damaged xrefs
                        if "bad xref" in str(e).lower():
                            print(f"⚠️ Bad xref {xref}: {e}")
                        else:
                            print(f"⚠️ Error at xref {xref}: {e}")
                        continue
                    except Exception as e:
                        print(f"⚠️ Error extracting image {img_index}: {e}")
                        continue

        except Exception as e:
            print(f"❌ Error in _extract_images_from_page: {e}")

        return images

    def _extract_tables_from_page(self, pdf_path: str, page_num: int) -> List[Dict]:
        """Extract tables from one page (0-based) as markdown dicts."""
        tables = []
        try:
            with pdfplumber.open(pdf_path) as pdf:
                page = pdf.pages[page_num]
                extracted_tables = page.extract_tables()

                for table_idx, table in enumerate(extracted_tables or []):
                    tables.append({
                        "type": "table",
                        "content": self._table_to_markdown(table),
                        "page": page_num,
                        "index": table_idx
                    })
        except Exception as e:
            print(f"Error extracting tables from page {page_num}: {e}")

        return tables

    def _table_to_markdown(self, table: List[List]) -> str:
        """Render a row-list table as markdown (first row is the header)."""
        if not table:
            return ""

        # None cells render as empty strings.
        md = "| " + " | ".join(str(cell or "") for cell in table[0]) + " |\n"
        md += "| " + " | ".join(["---"] * len(table[0])) + " |\n"

        for row in table[1:]:
            md += "| " + " | ".join(str(cell or "") for cell in row) + " |\n"

        return md

    def extract_pdf_content(self, pdf_path: str) -> Dict:
        """
        Extract all content (text, tables, images) from one PDF.

        Returns cached content when the file hash is unchanged.
        Returns None if the PDF cannot be processed.
        """
        pdf_name = os.path.basename(pdf_path)
        file_hash = self._get_file_hash(pdf_path)

        # Reuse the cached result when the file is unchanged.
        if pdf_name in self.cache and self.cache[pdf_name].get("hash") == file_hash:
            print(f"Using cached data for {pdf_name}")
            return self.cache[pdf_name]["content"]

        print(f"Processing PDF: {pdf_name}")
        content = {"filename": pdf_name, "pages": []}

        try:
            # FIX: open the PDF once and iterate its pages. The original
            # reopened the file for the page count, again per page for the
            # text, and yet again per page for the tables.
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages):
                    try:
                        raw_tables = page.extract_tables() or []
                    except Exception as e:
                        # Keep the original per-page tolerance for table errors.
                        print(f"Error extracting tables from page {page_num}: {e}")
                        raw_tables = []

                    tables = [
                        {
                            "type": "table",
                            "content": self._table_to_markdown(table),
                            "page": page_num,
                            "index": table_idx
                        }
                        for table_idx, table in enumerate(raw_tables)
                    ]

                    content["pages"].append({
                        "page_number": page_num + 1,  # 1-based for display
                        "text": page.extract_text() or "",
                        "tables": tables,
                        # Images need pymupdf, hence the separate per-page open.
                        "images": self._extract_images_from_page(pdf_path, page_num)
                    })

        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")
            return None

        # Cache the result keyed by filename + content hash.
        self.cache[pdf_name] = {
            "hash": file_hash,
            "content": content
        }
        self._save_cache()

        return content

    def process_all_pdfs(self, pdf_dir: str = None) -> List[Dict]:
        """Process every *.pdf in the directory; skips PDFs that fail."""
        if pdf_dir is None:
            pdf_dir = self.pdf_dir

        all_content = []
        pdf_files = list(Path(pdf_dir).glob("*.pdf"))

        if not pdf_files:
            print(f"No PDF files found in {pdf_dir}")
            return all_content

        for pdf_file in pdf_files:
            content = self.extract_pdf_content(str(pdf_file))
            if content:
                all_content.append(content)

        return all_content
228
+
229
+
230
def prepare_documents_for_embedding(pdf_content: Dict) -> List[Tuple[str, Dict]]:
    """
    Flatten extracted PDF content into (text, metadata) tuples for embedding.

    Args:
        pdf_content: Output of PDFProcessor.extract_pdf_content().

    Returns:
        List of (text, metadata) tuples covering page text, tables, and images.
    """
    documents = []

    for page in pdf_content.get("pages", []):
        page_num = page.get("page_number")
        filename = pdf_content.get("filename")

        # Page text
        if page.get("text"):
            documents.append((
                page["text"],
                {
                    "type": "text",
                    "page": page_num,
                    "filename": filename
                }
            ))

        # Tables rendered as markdown
        for table in page.get("tables", []):
            documents.append((
                f"Table on page {page_num}:\n{table['content']}",
                {
                    "type": "table",
                    "page": page_num,
                    "filename": filename
                }
            ))

        # Images. FIX: embed the LLM-generated summary when present — the
        # original ignored image["summary"] (set by
        # process_images_in_documents), so every image was embedded as the
        # same uninformative placeholder and the summaries were never used.
        for image in page.get("images", []):
            summary = image.get("summary")
            image_text = (f"Image on page {page_num}: {summary}"
                          if summary else f"Image on page {page_num}")
            documents.append((
                image_text,
                {
                    "type": "image",
                    "page": page_num,
                    "filename": filename,
                    "image_base64": image.get("base64"),
                    "image_format": image.get("format")
                }
            ))

    return documents
src/rag_chain.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_openai import ChatOpenAI
2
+ from langchain.chains import RetrievalQA
3
+ from langchain_core.prompts import PromptTemplate
4
+ from typing import Optional
5
+ import os
6
+
7
class RAGChain:
    """RAG chain over an OpenAI chat model, prompting for Russian answers."""

    def __init__(self,
                 retriever,
                 model_name: str = "gpt-4o-mini",
                 temperature: float = 0.3,
                 api_key: Optional[str] = None):
        """
        Initialize RAG chain.

        Args:
            retriever: LangChain retriever (from vector store)
            model_name: OpenAI model name
            temperature: Temperature for LLM
            api_key: OpenAI API key (falls back to OPENAI_API_KEY env var)
        """
        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=temperature,
            api_key=api_key or os.getenv("OPENAI_API_KEY"),
            max_tokens=1024
        )

        self.retriever = retriever

        # Custom prompt instructing the model to answer in Russian,
        # grounded strictly in the retrieved context.
        self.prompt_template = PromptTemplate(
            template="""Вы - полезный ассистент, специализирующийся на анализе документов.

Используя следующий контекст из документов, ответьте на вопрос.

Контекст:
{context}

Вопрос: {question}

Инструкции:
1. Ответьте только на основе информации из контекста
2. Если информация не найдена в контексте, скажите "Информация не найдена в документах"
3. Ответьте на русском языке
4. Будьте кратким и точным
5. Цитируйте источники если возможно

Ответ:""",
            input_variables=["context", "question"]
        )

        # "stuff" chain: all retrieved chunks are pasted into one prompt.
        self.chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": self.prompt_template}
        )

    def query(self, question: str) -> dict:
        """
        Query the RAG chain.

        Args:
            question: User question (can be in any language)

        Returns:
            Dict with "answer" (str) and "sources" (list of content/metadata
            dicts). Errors are reported in the answer text, never raised.
        """
        try:
            result = self.chain.invoke({"query": question})

            return {
                "answer": result.get("result", ""),
                "sources": [
                    {
                        "content": doc.page_content[:200],  # preview only
                        "metadata": doc.metadata
                    }
                    for doc in result.get("source_documents", [])
                ]
            }

        except Exception as e:
            return {
                "answer": f"Ошибка при обработке запроса: {str(e)}",
                "sources": []
            }

    def query_with_context(self, question: str, context_limit: int = 5) -> dict:
        """
        Query with explicit context retrieval.

        Args:
            question: User question
            context_limit: Max number of context chunks to use

        Returns:
            Dict with "answer" and "context_documents".
        """
        # FIX: the original passed search_kwargs={"k": context_limit} as a
        # runtime kwarg to get_relevant_documents, which ignores it (k is
        # fixed when the retriever is constructed) — so context_limit had
        # no effect. Retrieve, then truncate to the requested limit.
        relevant_docs = self.retriever.get_relevant_documents(question)[:context_limit]

        # Format context with per-chunk provenance.
        context = "\n\n".join([
            f"Источник: {doc.metadata}\n{doc.page_content}"
            for doc in relevant_docs
        ])

        # Fill the shared prompt template and call the LLM directly.
        prompt = self.prompt_template.format(context=context, question=question)
        response = self.llm.invoke(prompt)

        return {
            "answer": response.content,
            "context_documents": [
                {
                    "content": doc.page_content[:300],
                    "metadata": doc.metadata
                }
                for doc in relevant_docs
            ]
        }
src/vectorstore_manager.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+ from chromadb.config import Settings
3
+ from langchain_chroma import Chroma
4
+ from langchain_core.documents import Document
5
+ from typing import List, Dict, Optional
6
+ import os
7
+
8
class VectorStoreManager:
    """Manages a persistent ChromaDB-backed vector store for the RAG app."""

    def __init__(self,
                 persist_dir: str = "./chroma_db",
                 collection_name: str = "pdf_documents",
                 embeddings=None):
        """
        Set up the persistent Chroma client and its LangChain wrapper.

        Args:
            persist_dir: Directory for persistent storage (created if missing).
            collection_name: Name of the collection.
            embeddings: LangChain embeddings instance.
        """
        self.persist_dir = persist_dir
        self.collection_name = collection_name
        self.embeddings = embeddings

        os.makedirs(persist_dir, exist_ok=True)

        # On-disk Chroma client plus the LangChain wrapper around it.
        self.client = chromadb.PersistentClient(path=persist_dir)
        self.vector_store = Chroma(
            client=self.client,
            collection_name=collection_name,
            embedding_function=embeddings,
            persist_directory=persist_dir
        )

        print(f"Vector store initialized: {persist_dir}/{collection_name}")

    def add_documents(self, documents: List[Document], batch_size: int = 50):
        """
        Add documents to the store in batches; a failed batch is logged
        and skipped rather than aborting the whole upload.

        Args:
            documents: List of LangChain Document objects
            batch_size: Number of documents per batch
        """
        for batch_no, start in enumerate(range(0, len(documents), batch_size), start=1):
            batch = documents[start:start + batch_size]
            try:
                self.vector_store.add_documents(batch)
                print(f"Added {len(batch)} documents (batch {batch_no})")
            except Exception as e:
                print(f"Error adding documents: {e}")

    def search(self, query: str, k: int = 5) -> List[Dict]:
        """
        Similarity search returning content/metadata/score dicts.

        Args:
            query: Search query
            k: Number of results to return

        Returns:
            List of result dicts. NOTE(review): the "similarity" value is
            whatever similarity_search_with_score returns — for Chroma that
            looks like a distance (lower = closer); confirm before ranking.
        """
        scored = self.vector_store.similarity_search_with_score(query, k=k)
        return [
            {
                "content": doc.page_content,
                "metadata": doc.metadata,
                "similarity": score
            }
            for doc, score in scored
        ]

    def get_retriever(self, search_kwargs: Optional[Dict] = None):
        """Build a retriever for the RAG chain (defaults to k=5)."""
        if search_kwargs is None:
            search_kwargs = {"k": 5}
        return self.vector_store.as_retriever(search_kwargs=search_kwargs)

    def collection_count(self) -> int:
        """Number of documents in the collection; 0 if it cannot be read."""
        try:
            return self.client.get_collection(self.collection_name).count()
        except Exception as e:
            print(f"Error getting collection count: {e}")
            return 0

    def clear_collection(self):
        """Delete the collection and rebuild an empty wrapper around it."""
        try:
            self.client.delete_collection(self.collection_name)
            self.vector_store = Chroma(
                client=self.client,
                collection_name=self.collection_name,
                embedding_function=self.embeddings,
                persist_directory=self.persist_dir
            )
            print(f"Collection cleared: {self.collection_name}")
        except Exception as e:
            print(f"Error clearing collection: {e}")