hamxaameer committed
Commit 0a73b6c · verified · Parent(s): 164acc9

Update app.py

Files changed (1)
  1. app.py +93 -80
app.py CHANGED
@@ -88,7 +88,7 @@ def initialize_embeddings():
     return embeddings
 
 def load_vector_store(embeddings):
-    """Load FAISS vector store with full Pydantic bypass"""
+    """Load FAISS vector store with Pydantic monkey-patch"""
     logger.info("🔄 Loading FAISS vector store...")
 
     vector_store_path = CONFIG["vector_store_path"]
@@ -119,95 +119,108 @@ def load_vector_store(embeddings):
         return vectorstore
 
     except (KeyError, AttributeError, Exception) as e:
-        logger.warning(f"⚠️ Pydantic version mismatch: {e}")
-        logger.info("🔄 Using custom pickle loader to bypass Pydantic...")
-
-        import faiss
-        import pickle
-        from langchain_community.docstore.in_memory import InMemoryDocstore
-
-        # Custom unpickler that bypasses Pydantic validation
-        class PydanticBypassUnpickler(pickle.Unpickler):
-            def find_class(self, module, name):
-                # Redirect Pydantic Document to LangChain Document
-                if 'pydantic' in module or name == 'Document':
-                    return Document
-                return super().find_class(module, name)
-
-        # Load FAISS index
-        logger.info("   Loading FAISS index...")
-        index = faiss.read_index(index_file)
-
-        # Load pickle with bypass
-        logger.info("   Loading documents with Pydantic bypass...")
-        with open(pkl_file, "rb") as f:
-            unpickler = PydanticBypassUnpickler(f)
-
-            # Manually parse pickle structure
-            try:
-                raw_data = unpickler.load()
-
-                # Extract docstore and index mapping
-                if isinstance(raw_data, tuple) and len(raw_data) >= 2:
-                    docstore_data = raw_data[0]
-                    index_to_docstore_id = raw_data[1]
-                else:
-                    raise ValueError("Unexpected pickle structure")
-
-                # Rebuild docstore with new Document objects
-                new_docstore_dict = {}
-
-                if hasattr(docstore_data, '_dict'):
-                    old_docs = docstore_data._dict
-                elif isinstance(docstore_data, dict):
-                    old_docs = docstore_data
-                else:
-                    old_docs = {}
-
-                logger.info(f"   Rebuilding {len(old_docs)} documents...")
-
-                for doc_id, old_doc in old_docs.items():
-                    # Extract content and metadata safely
-                    if hasattr(old_doc, 'page_content'):
-                        content = old_doc.page_content
-                    elif isinstance(old_doc, dict):
-                        content = old_doc.get('page_content', '')
-                    else:
-                        content = str(old_doc)
-
-                    if hasattr(old_doc, 'metadata'):
-                        metadata = old_doc.metadata if isinstance(old_doc.metadata, dict) else {}
-                    elif isinstance(old_doc, dict):
-                        metadata = old_doc.get('metadata', {})
-                    else:
-                        metadata = {}
-
-                    # Create fresh Document without Pydantic issues
-                    new_doc = Document(
-                        page_content=content,
-                        metadata=metadata
-                    )
-                    new_docstore_dict[doc_id] = new_doc
-
-                # Create new docstore
-                docstore = InMemoryDocstore(new_docstore_dict)
-
-                logger.info(f"   ✅ Rebuilt {len(new_docstore_dict)} documents successfully")
-
-            except Exception as e2:
-                logger.error(f"❌ Custom unpickler failed: {e2}")
-                raise
-
-        # Create FAISS vectorstore
-        vectorstore = FAISS(
-            embedding_function=embeddings,
-            index=index,
-            docstore=docstore,
-            index_to_docstore_id=index_to_docstore_id
-        )
-
-        logger.info(f"✅ FAISS vector store loaded with custom loader")
-        return vectorstore
+        logger.warning(f"⚠️ Pydantic compatibility issue: {str(e)[:100]}")
+        logger.info("🔄 Applying Pydantic monkey-patch and retrying...")
+
+        # STEP 1: Monkey-patch Pydantic to handle missing __fields_set__
+        try:
+            import pydantic.v1.main as pydantic_main
+
+            # Save original __setstate__
+            original_setstate = pydantic_main.BaseModel.__setstate__
+
+            def patched_setstate(self, state):
+                """Patched __setstate__ that handles missing __fields_set__"""
+                # Add missing __fields_set__ if not present
+                if '__fields_set__' not in state:
+                    state['__fields_set__'] = set(state.get('__dict__', {}).keys())
+                # Call original
+                return original_setstate(self, state)
+
+            # Apply patch
+            pydantic_main.BaseModel.__setstate__ = patched_setstate
+            logger.info("   ✅ Pydantic monkey-patch applied")
+
+        except Exception as patch_error:
+            logger.warning(f"   ⚠️ Pydantic patch failed: {patch_error}")
+
+        # STEP 2: Try loading again with patch
+        try:
+            vectorstore = FAISS.load_local(
+                vector_store_path,
+                embeddings,
+                allow_dangerous_deserialization=True
+            )
+            logger.info("✅ FAISS vector store loaded with Pydantic patch")
+            return vectorstore
+
+        except Exception as e2:
+            logger.error(f"   ✗ Still failed after patch: {str(e2)[:100]}")
+
+            # STEP 3: Last resort - manual reconstruction
+            logger.info("🔄 Using manual reconstruction (last resort)...")
+
+            import re
+
+            import faiss
+            from langchain_community.docstore.in_memory import InMemoryDocstore
+
+            # Load FAISS index
+            index = faiss.read_index(index_file)
+            logger.info("   ✅ FAISS index loaded")
+
+            # Read the pickle as raw bytes and mine it for text
+            with open(pkl_file, "rb") as f:
+                raw_bytes = f.read()
+            logger.info(f"   Read {len(raw_bytes)} bytes from pickle")
+
+            # Extract text content directly (bypass Pydantic completely):
+            # keep any byte run that looks like document text
+            text_pattern = rb'([A-Za-z0-9\s\.\,\;\:\!\?\-\'\"\(\)]{50,})'
+            matches = re.findall(text_pattern, raw_bytes)
+
+            if len(matches) > 100:
+                logger.info(f"   Found {len(matches)} potential document fragments")
+
+                # Create simple documents from extracted text
+                new_docstore_dict = {}
+                index_to_docstore_id = {}
+
+                for idx, match in enumerate(matches[:15000]):  # Limit to 15k docs
+                    try:
+                        content = match.decode('utf-8', errors='ignore').strip()
+                        if len(content) > 50:  # Only keep substantial content
+                            doc_id = str(idx)
+                            new_doc = Document(
+                                page_content=content,
+                                metadata={}
+                            )
+                            new_docstore_dict[doc_id] = new_doc
+                            index_to_docstore_id[idx] = doc_id
+                    except Exception:
+                        continue
+
+                logger.info(f"   ✅ Reconstructed {len(new_docstore_dict)} documents from raw data")
+
+                docstore = InMemoryDocstore(new_docstore_dict)
+
+                vectorstore = FAISS(
+                    embedding_function=embeddings,
+                    index=index,
+                    docstore=docstore,
+                    index_to_docstore_id=index_to_docstore_id
+                )
+
+                logger.info("✅ FAISS vector store reconstructed from raw data")
+                return vectorstore
+            else:
+                raise Exception("Could not extract enough document content from pickle")
 
 # ============================================================================
 # RAG PIPELINE FUNCTIONS
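
The removed loader leaned on pickle.Unpickler.find_class, the hook pickle consults each time it resolves a class reference stored in the stream. Below is a minimal, self-contained sketch of that redirection idea; LegacyDocument and PlainDocument are illustrative stand-ins, not classes from this app.

import io
import pickle

class LegacyDocument:
    """Plays the role of the old, Pydantic-bound Document class."""
    def __init__(self, page_content="", metadata=None):
        self.page_content = page_content
        self.metadata = metadata or {}

class PlainDocument:
    """Plays the role of the replacement Document class."""
    def __init__(self, page_content="", metadata=None):
        self.page_content = page_content
        self.metadata = metadata or {}

class RedirectingUnpickler(pickle.Unpickler):
    def find_class(self, module, name):
        # Resolve any class pickled under the name 'LegacyDocument'
        # to PlainDocument instead; the legacy class never loads.
        if name == "LegacyDocument":
            return PlainDocument
        return super().find_class(module, name)

blob = pickle.dumps(LegacyDocument("hello world", {"source": "demo"}))
doc = RedirectingUnpickler(io.BytesIO(blob)).load()
print(type(doc).__name__, doc.page_content)  # PlainDocument hello world

A plausible reason the commit abandons this route: the app redirected to LangChain's Document, which is itself Pydantic-backed, so the incompatible pickled state could still hit Pydantic's __setstate__ on the way in.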
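
STEP 1 of the replacement works because pydantic v1's BaseModel.__setstate__ reads state['__fields_set__'] directly, so a pickle written before that key existed dies with a KeyError. A toy reproduction of the shim, assuming a pydantic 2.x install (which vendors the v1 layer as pydantic.v1; on a pure 1.x install the import would be pydantic.main instead); the Doc model and stale_state payload are fabricated for the demo.

import pydantic.v1.main as pydantic_main

# Keep a handle on the real implementation so the patch can delegate to it.
original_setstate = pydantic_main.BaseModel.__setstate__

def patched_setstate(self, state):
    # Synthesize the missing key from the stored field values.
    if "__fields_set__" not in state:
        state["__fields_set__"] = set(state.get("__dict__", {}).keys())
    return original_setstate(self, state)

pydantic_main.BaseModel.__setstate__ = patched_setstate

class Doc(pydantic_main.BaseModel):
    page_content: str = ""

# Simulate a stale pickle payload that predates '__fields_set__'.
stale_state = {"__dict__": {"page_content": "hi"}}
doc = Doc.__new__(Doc)         # what pickle does before calling __setstate__
doc.__setstate__(stale_state)  # would raise KeyError without the patch
print(doc.page_content, doc.__fields_set__)  # hi {'page_content'}

Because the patch only fills in a default when the key is absent and then delegates, well-formed pickles pass through unchanged, which is what makes the shim reasonably safe to apply process-wide.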
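
STEP 3 is deliberately crude: it treats the docstore pickle as opaque bytes and keeps any run of 50 or more printable-looking characters. A small demonstration of the same pattern on synthetic bytes (the raw_bytes blob here is made up):

import re

# Binary framing around two long text runs and one short one.
raw_bytes = (
    b"\x80\x05\x95q\x00"
    b"Climate change is shifting monsoon rainfall patterns across South Asia."
    b"\x94\x8c\x10short fragment\x94"
    b"Farmers are adapting by planting drought tolerant crop varieties earlier."
)

text_pattern = rb'([A-Za-z0-9\s\.\,\;\:\!\?\-\'\"\(\)]{50,})'
for i, match in enumerate(re.findall(text_pattern, raw_bytes)):
    print(i, match.decode("utf-8", errors="ignore"))
# 0 Climate change is shifting monsoon rainfall patterns across South Asia.
# 1 Farmers are adapting by planting drought tolerant crop varieties earlier.

Worth noting: because index_to_docstore_id is rebuilt from match order, fragment i is paired with whichever vector sits at FAISS position i, and the byte scan cannot guarantee those line up. The fallback recovers readable text, but not necessarily the original text-to-embedding alignment.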