MrSimple07 committed on
Commit
65025a2
·
1 Parent(s): 3b72f75

new version of rag

Browse files
Files changed (1) hide show
  1. app.py +196 -63
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import json
3
  import zipfile
 
4
  from typing import List, Dict, Any
5
  import pandas as pd
6
  from huggingface_hub import hf_hub_download, list_repo_files
@@ -16,6 +17,16 @@ from llama_index.llms.openai import OpenAI
16
  import gradio as gr
17
  import sys
18
 
 
 
 
 
 
 
 
 
 
 
19
  GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
20
  OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
21
  HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
@@ -60,71 +71,121 @@ TABLE_MAX_ROWS_PER_CHUNK = 30
60
 
61
  os.makedirs(DOWNLOAD_DIR, exist_ok=True)
62
 
 
 
 
 
 
 
 
 
 
 
63
  def get_llm_model(model_name):
64
  try:
 
65
  model_config = AVAILABLE_MODELS.get(model_name)
66
  if not model_config:
 
67
  model_config = AVAILABLE_MODELS[DEFAULT_MODEL]
68
 
69
  if not model_config.get("api_key"):
70
- raise Exception(f"API ключ не найден для модели {model_name}")
71
 
72
  if model_config["provider"] == "google":
73
- # Fix: Remove image_config parameter or set it properly
74
- return GoogleGenAI(
75
  model=model_config["model_name"],
76
- api_key=model_config["api_key"],
77
- # Don't pass image_config=None
78
  )
 
 
79
  elif model_config["provider"] == "openai":
80
- return OpenAI(
81
  model=model_config["model_name"],
82
  api_key=model_config["api_key"]
83
  )
 
 
84
  else:
85
- raise Exception(f"Неподдерживаемый провайдер: {model_config['provider']}")
86
 
87
  except Exception as e:
 
 
88
  return GoogleGenAI(
89
  model="gemini-2.0-flash",
90
  api_key=GOOGLE_API_KEY
91
  )
92
 
93
  def get_embedding_model():
 
94
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
95
- return HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
 
96
 
97
  def list_zip_files_in_repo(repo_id: str) -> List[str]:
98
- files = list_repo_files(repo_id, repo_type="dataset", token=HF_TOKEN) # Add repo_type="dataset"
99
- return [f for f in files if f.startswith(JSON_FILES_DIR) and f.endswith('.zip')]
 
 
 
100
 
101
  def download_file_from_hf(repo_id: str, path_in_repo: str, dest_dir: str) -> str:
 
102
  local_path = hf_hub_download(
103
  repo_id=repo_id,
104
  filename=path_in_repo,
105
  repo_type="dataset",
106
  token=HF_TOKEN,
107
- local_dir=dest_dir # Add this to download directly to dest_dir
108
  )
109
- return local_path # Return the path directly
 
110
 
111
  def read_jsons_from_zip(zip_path: str) -> List[Dict[str, Any]]:
 
112
  docs = []
 
 
 
113
  with zipfile.ZipFile(zip_path, 'r') as z:
114
- for name in z.namelist():
115
- if name.lower().endswith('.json'):
 
 
 
116
  with z.open(name) as f:
117
- try:
118
- text = f.read().decode('utf-8')
119
- data = json.loads(text)
120
- docs.append(data)
121
- except Exception as e:
122
- print(f"Failed to load {name} in {zip_path}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  return docs
124
 
125
  def chunk_text_field(text: str, doc_meta: Dict[str, Any], splitter: SentenceSplitter) -> List[Document]:
126
  nodes = splitter.split_text(text)
127
  chunks = []
 
128
  for i, node_text in enumerate(nodes):
129
  md = dict(doc_meta)
130
  md.update({
@@ -132,17 +193,27 @@ def chunk_text_field(text: str, doc_meta: Dict[str, Any], splitter: SentenceSpli
132
  'chunk_type': 'text'
133
  })
134
  chunks.append(Document(text=node_text, metadata=md))
 
 
 
135
  return chunks
136
 
137
  def chunk_table(table: Dict[str, Any], table_meta: Dict[str, Any], max_rows: int = TABLE_MAX_ROWS_PER_CHUNK) -> List[Document]:
138
  headers = table.get('headers') or []
139
  rows = table.get('data') or []
 
 
 
140
  if not rows:
141
  text = table.get('table_description') or table.get('table_title') or ''
142
  md = {**table_meta, 'chunk_type': 'table', 'chunk_id': f"{table_meta.get('document_id')}_table_single"}
 
 
143
  return [Document(text=text, metadata=md)]
144
 
145
  chunks = []
 
 
146
  for i in range(0, len(rows), max_rows):
147
  block = rows[i:i+max_rows]
148
  lines = []
@@ -155,6 +226,9 @@ def chunk_table(table: Dict[str, Any], table_meta: Dict[str, Any], max_rows: int
155
  md = dict(table_meta)
156
  md.update({'chunk_type': 'table', 'chunk_id': f"{table_meta.get('document_id')}_table_{i // max_rows}"})
157
  chunks.append(Document(text=chunk_text, metadata=md))
 
 
 
158
  return chunks
159
 
160
  def chunk_image(image_entry: Dict[str, Any], image_meta: Dict[str, Any]) -> Document:
@@ -163,68 +237,116 @@ def chunk_image(image_entry: Dict[str, Any], image_meta: Dict[str, Any]) -> Docu
163
  txt += f"Файл: {image_entry.get('Файл изображения') or image_entry.get('file','')}."
164
  md = dict(image_meta)
165
  md.update({'chunk_type': 'image', 'chunk_id': f"{image_meta.get('document_id')}_image_{image_entry.get('№ Изображения','0')}"})
 
 
 
166
  return Document(text=txt, metadata=md)
167
 
168
  def build_chunks_from_repo(repo_id: str) -> List[Document]:
 
 
 
 
169
  zip_paths = list_zip_files_in_repo(repo_id)
170
- print(f"Found {len(zip_paths)} zip files under {JSON_FILES_DIR} in repo {repo_id}")
171
 
172
  splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
 
 
173
  all_chunks = []
174
 
175
- for remote_path in zip_paths:
176
- print(f"Downloading {remote_path}...")
177
  local_zip = download_file_from_hf(repo_id, remote_path, DOWNLOAD_DIR)
178
- print(f"Parsing {local_zip}...")
179
  json_docs = read_jsons_from_zip(local_zip)
180
- for doc in json_docs:
 
 
 
 
181
  doc_meta = doc.get('document_metadata', {})
182
- doc_id = doc_meta.get('document_id') or doc_meta.get('document_name') or 'unknown_doc'
183
  base_meta = {'document_id': doc_id, 'document_name': doc_meta.get('document_name','')}
184
-
185
- for sec in doc.get('sections', []):
186
- sec_meta = dict(base_meta)
187
- sec_meta.update({'section_id': sec.get('section_id'), 'section_title': None})
188
- text = sec.get('section_text') or sec.get('text') or ''
189
- if text and text.strip():
190
- chunks = chunk_text_field(text, sec_meta, splitter)
191
- all_chunks.extend(chunks)
192
-
193
- for sheet in doc.get('sheets', []) + doc.get('tables', []) if (doc.get('sheets') or doc.get('tables')) else []:
194
- table_meta = dict(base_meta)
195
- table_meta.update({
196
- 'sheet_name': sheet.get('sheet_name') or sheet.get('table_title'),
197
- 'section': sheet.get('section'),
198
- 'table_number': sheet.get('table_number'),
199
- 'table_title': sheet.get('table_title')
200
- })
201
- table_chunks = chunk_table(sheet, table_meta, max_rows=TABLE_MAX_ROWS_PER_CHUNK)
202
- all_chunks.extend(table_chunks)
203
-
204
- for img in doc.get('images', []) or doc.get('image_data', []) or doc.get('image_entries', []):
205
- img_meta = dict(base_meta)
206
- chunk = chunk_image(img, img_meta)
207
- all_chunks.append(chunk)
208
-
209
- print(f"Built total {len(all_chunks)} chunks")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  return all_chunks
211
 
212
  def create_hybrid_index(documents):
213
- print("Creating vector index...")
 
214
  vector_index = VectorStoreIndex.from_documents(documents)
 
215
 
216
- print("Creating keyword index...")
217
  keyword_index = KeywordTableIndex.from_documents(documents)
 
218
 
219
  return vector_index, keyword_index
220
 
221
  def create_fusion_retriever(vector_index, keyword_index, documents):
 
 
222
  vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)
 
223
 
224
  bm25_retriever = BM25Retriever.from_defaults(
225
  docstore=vector_index.docstore,
226
  similarity_top_k=5
227
  )
 
228
 
229
  fusion_retriever = QueryFusionRetriever(
230
  [vector_retriever, bm25_retriever],
@@ -233,26 +355,32 @@ def create_fusion_retriever(vector_index, keyword_index, documents):
233
  mode="reciprocal_rerank",
234
  use_async=False
235
  )
 
236
 
237
  return fusion_retriever
238
 
239
  def create_query_engine(vector_index, keyword_index, documents):
 
240
  fusion_retriever = create_fusion_retriever(vector_index, keyword_index, documents)
241
 
242
  response_synthesizer = get_response_synthesizer(
243
  response_mode=ResponseMode.COMPACT,
244
  use_async=False
245
  )
 
246
 
247
  query_engine = RetrieverQueryEngine(
248
  retriever=fusion_retriever,
249
  response_synthesizer=response_synthesizer
250
  )
 
251
 
252
  return query_engine
253
 
254
  def initialize_system():
255
- print("Initializing system...")
 
 
256
 
257
  embed_model = get_embedding_model()
258
  llm = get_llm_model(DEFAULT_MODEL)
@@ -261,17 +389,17 @@ def initialize_system():
261
  Settings.llm = llm
262
  Settings.chunk_size = CHUNK_SIZE
263
  Settings.chunk_overlap = CHUNK_OVERLAP
 
264
 
265
- print("Loading documents...")
266
  documents = build_chunks_from_repo(HF_REPO_ID)
267
 
268
- print("Creating indices...")
269
  vector_index, keyword_index = create_hybrid_index(documents)
270
 
271
- print("Creating query engine...")
272
  query_engine = create_query_engine(vector_index, keyword_index, documents)
273
 
274
- print("System initialized successfully!")
 
 
275
  return query_engine, vector_index, keyword_index, documents
276
 
277
  def answer_question(question, query_engine):
@@ -279,7 +407,9 @@ def answer_question(question, query_engine):
279
  return "<div style='color: black;'>Please enter a question</div>"
280
 
281
  try:
 
282
  response = query_engine.query(question)
 
283
 
284
  answer_html = f"""
285
  <div style='background-color: #f8f9fa; padding: 20px; border-radius: 10px; color: black;'>
@@ -303,18 +433,21 @@ def answer_question(question, query_engine):
303
  return answer_html, sources_html
304
 
305
  except Exception as e:
 
306
  error_html = f"<div style='color: red;'>Error: {str(e)}</div>"
307
  return error_html, error_html
308
 
309
  def switch_model(model_name, vector_index, keyword_index, documents):
310
  try:
311
- print(f"Switching to model: {model_name}")
312
  new_llm = get_llm_model(model_name)
313
  Settings.llm = new_llm
314
 
315
  new_query_engine = create_query_engine(vector_index, keyword_index, documents)
 
316
  return new_query_engine, f"✅ Model switched to: {model_name}"
317
  except Exception as e:
 
318
  return None, f"❌ Error: {str(e)}"
319
 
320
  query_engine = None
@@ -394,12 +527,12 @@ def create_interface():
394
  def main():
395
  global query_engine, vector_index, keyword_index, documents
396
 
397
- print("Starting AIEXP - AI Expert for Regulatory Documentation")
398
 
399
  query_engine, vector_index, keyword_index, documents = initialize_system()
400
 
401
  if query_engine:
402
- print("Launching web interface...")
403
  demo = create_interface()
404
  demo.launch(
405
  server_name="0.0.0.0",
@@ -407,7 +540,7 @@ def main():
407
  share=True
408
  )
409
  else:
410
- print("Failed to initialize system")
411
  sys.exit(1)
412
 
413
  if __name__ == "__main__":
 
1
  import os
2
  import json
3
  import zipfile
4
+ import logging
5
  from typing import List, Dict, Any
6
  import pandas as pd
7
  from huggingface_hub import hf_hub_download, list_repo_files
 
17
  import gradio as gr
18
  import sys
19
 
20
# Log to both a file and stdout. The file handler is opened as UTF-8
# explicitly: without it the log file uses the platform's locale encoding,
# and non-ASCII messages (e.g. Cyrillic document names) may fail to encode.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('rag_system.log', encoding='utf-8'),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)
29
+
30
  GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
31
  OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
32
  HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
 
71
 
72
  os.makedirs(DOWNLOAD_DIR, exist_ok=True)
73
 
74
# Module-wide processing counters. They are updated in place by the
# zip-reading and chunking helpers and reported in the build summary.
stats = dict(
    total_documents=0,
    total_text_chunks=0,
    total_tables=0,
    total_table_chunks=0,
    total_images=0,
    failed_files=0,
    encoding_errors=[],
)
83
+
84
  def get_llm_model(model_name):
85
  try:
86
+ logger.info(f"Initializing LLM model: {model_name}")
87
  model_config = AVAILABLE_MODELS.get(model_name)
88
  if not model_config:
89
+ logger.warning(f"Model {model_name} not found, using default: {DEFAULT_MODEL}")
90
  model_config = AVAILABLE_MODELS[DEFAULT_MODEL]
91
 
92
  if not model_config.get("api_key"):
93
+ raise Exception(f"API key not found for model {model_name}")
94
 
95
  if model_config["provider"] == "google":
96
+ llm = GoogleGenAI(
 
97
  model=model_config["model_name"],
98
+ api_key=model_config["api_key"]
 
99
  )
100
+ logger.info(f"Successfully initialized Google model: {model_config['model_name']}")
101
+ return llm
102
  elif model_config["provider"] == "openai":
103
+ llm = OpenAI(
104
  model=model_config["model_name"],
105
  api_key=model_config["api_key"]
106
  )
107
+ logger.info(f"Successfully initialized OpenAI model: {model_config['model_name']}")
108
+ return llm
109
  else:
110
+ raise Exception(f"Unsupported provider: {model_config['provider']}")
111
 
112
  except Exception as e:
113
+ logger.error(f"Error initializing model {model_name}: {e}")
114
+ logger.info("Falling back to default Gemini model")
115
  return GoogleGenAI(
116
  model="gemini-2.0-flash",
117
  api_key=GOOGLE_API_KEY
118
  )
119
 
120
  def get_embedding_model():
121
+ logger.info("Initializing embedding model: all-MiniLM-L6-v2")
122
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
123
+ embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
124
+ logger.info("Embedding model initialized successfully")
125
+ return embed_model
126
 
127
  def list_zip_files_in_repo(repo_id: str) -> List[str]:
128
+ logger.info(f"Listing files in repository: {repo_id}")
129
+ files = list_repo_files(repo_id, repo_type="dataset", token=HF_TOKEN)
130
+ zip_files = [f for f in files if f.startswith(JSON_FILES_DIR) and f.endswith('.zip')]
131
+ logger.info(f"Found {len(zip_files)} zip files in {JSON_FILES_DIR} directory")
132
+ return zip_files
133
 
134
  def download_file_from_hf(repo_id: str, path_in_repo: str, dest_dir: str) -> str:
135
+ logger.info(f"Downloading file: {path_in_repo}")
136
  local_path = hf_hub_download(
137
  repo_id=repo_id,
138
  filename=path_in_repo,
139
  repo_type="dataset",
140
  token=HF_TOKEN,
141
+ local_dir=dest_dir
142
  )
143
+ logger.info(f"File downloaded to: {local_path}")
144
+ return local_path
145
 
146
  def read_jsons_from_zip(zip_path: str) -> List[Dict[str, Any]]:
147
+ logger.info(f"Reading JSON files from zip: {zip_path}")
148
  docs = []
149
+ json_count = 0
150
+ failed_count = 0
151
+
152
  with zipfile.ZipFile(zip_path, 'r') as z:
153
+ json_files = [name for name in z.namelist() if name.lower().endswith('.json')]
154
+ logger.info(f"Found {len(json_files)} JSON files in zip")
155
+
156
+ for name in json_files:
157
+ try:
158
  with z.open(name) as f:
159
+ raw_bytes = f.read()
160
+
161
+ for encoding in ['utf-8', 'utf-8-sig', 'latin-1', 'cp1251', 'windows-1251']:
162
+ try:
163
+ text = raw_bytes.decode(encoding)
164
+ data = json.loads(text)
165
+ docs.append(data)
166
+ json_count += 1
167
+ logger.debug(f"Successfully loaded {name} with encoding {encoding}")
168
+ break
169
+ except (UnicodeDecodeError, json.JSONDecodeError):
170
+ continue
171
+ else:
172
+ failed_count += 1
173
+ stats['failed_files'] += 1
174
+ stats['encoding_errors'].append(name)
175
+ logger.warning(f"Failed to load {name} - tried all encodings")
176
+
177
+ except Exception as e:
178
+ failed_count += 1
179
+ stats['failed_files'] += 1
180
+ logger.error(f"Error processing {name}: {e}")
181
+
182
+ logger.info(f"Successfully loaded {json_count} JSON files, failed: {failed_count}")
183
  return docs
184
 
185
  def chunk_text_field(text: str, doc_meta: Dict[str, Any], splitter: SentenceSplitter) -> List[Document]:
186
  nodes = splitter.split_text(text)
187
  chunks = []
188
+
189
  for i, node_text in enumerate(nodes):
190
  md = dict(doc_meta)
191
  md.update({
 
193
  'chunk_type': 'text'
194
  })
195
  chunks.append(Document(text=node_text, metadata=md))
196
+
197
+ stats['total_text_chunks'] += len(chunks)
198
+ logger.debug(f"Created {len(chunks)} text chunks for document {doc_meta.get('document_id')}")
199
  return chunks
200
 
201
  def chunk_table(table: Dict[str, Any], table_meta: Dict[str, Any], max_rows: int = TABLE_MAX_ROWS_PER_CHUNK) -> List[Document]:
202
  headers = table.get('headers') or []
203
  rows = table.get('data') or []
204
+
205
+ stats['total_tables'] += 1
206
+
207
  if not rows:
208
  text = table.get('table_description') or table.get('table_title') or ''
209
  md = {**table_meta, 'chunk_type': 'table', 'chunk_id': f"{table_meta.get('document_id')}_table_single"}
210
+ stats['total_table_chunks'] += 1
211
+ logger.debug(f"Created single chunk for empty table: {table_meta.get('table_title')}")
212
  return [Document(text=text, metadata=md)]
213
 
214
  chunks = []
215
+ num_chunks = (len(rows) + max_rows - 1) // max_rows
216
+
217
  for i in range(0, len(rows), max_rows):
218
  block = rows[i:i+max_rows]
219
  lines = []
 
226
  md = dict(table_meta)
227
  md.update({'chunk_type': 'table', 'chunk_id': f"{table_meta.get('document_id')}_table_{i // max_rows}"})
228
  chunks.append(Document(text=chunk_text, metadata=md))
229
+
230
+ stats['total_table_chunks'] += len(chunks)
231
+ logger.debug(f"Table '{table_meta.get('table_title')}': {len(rows)} rows split into {len(chunks)} chunks")
232
  return chunks
233
 
234
  def chunk_image(image_entry: Dict[str, Any], image_meta: Dict[str, Any]) -> Document:
 
237
  txt += f"Файл: {image_entry.get('Файл изображения') or image_entry.get('file','')}."
238
  md = dict(image_meta)
239
  md.update({'chunk_type': 'image', 'chunk_id': f"{image_meta.get('document_id')}_image_{image_entry.get('№ Изображения','0')}"})
240
+
241
+ stats['total_images'] += 1
242
+ logger.debug(f"Created image chunk: {image_entry.get('Название изображения', 'unknown')}")
243
  return Document(text=txt, metadata=md)
244
 
245
  def build_chunks_from_repo(repo_id: str) -> List[Document]:
246
+ logger.info("=" * 80)
247
+ logger.info("Starting document processing from repository")
248
+ logger.info("=" * 80)
249
+
250
  zip_paths = list_zip_files_in_repo(repo_id)
251
+ logger.info(f"Total zip files to process: {len(zip_paths)}")
252
 
253
  splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
254
+ logger.info(f"Text splitter configured: chunk_size={CHUNK_SIZE}, chunk_overlap={CHUNK_OVERLAP}")
255
+
256
  all_chunks = []
257
 
258
+ for zip_idx, remote_path in enumerate(zip_paths, 1):
259
+ logger.info(f"\n[{zip_idx}/{len(zip_paths)}] Processing zip file: {remote_path}")
260
  local_zip = download_file_from_hf(repo_id, remote_path, DOWNLOAD_DIR)
 
261
  json_docs = read_jsons_from_zip(local_zip)
262
+
263
+ logger.info(f"Processing {len(json_docs)} documents from {remote_path}")
264
+ stats['total_documents'] += len(json_docs)
265
+
266
+ for doc_idx, doc in enumerate(json_docs, 1):
267
  doc_meta = doc.get('document_metadata', {})
268
+ doc_id = doc_meta.get('document_id') or doc_meta.get('document_name') or f'unknown_doc_{doc_idx}'
269
  base_meta = {'document_id': doc_id, 'document_name': doc_meta.get('document_name','')}
270
+
271
+ logger.info(f" Document [{doc_idx}/{len(json_docs)}]: {doc_id}")
272
+
273
+ sections = doc.get('sections', [])
274
+ if sections:
275
+ logger.info(f" Processing {len(sections)} text sections")
276
+ for sec in sections:
277
+ sec_meta = dict(base_meta)
278
+ sec_meta.update({'section_id': sec.get('section_id'), 'section_title': None})
279
+ text = sec.get('section_text') or sec.get('text') or ''
280
+ if text and text.strip():
281
+ chunks = chunk_text_field(text, sec_meta, splitter)
282
+ all_chunks.extend(chunks)
283
+
284
+ tables = doc.get('sheets', []) + doc.get('tables', []) if (doc.get('sheets') or doc.get('tables')) else []
285
+ if tables:
286
+ logger.info(f" Processing {len(tables)} tables")
287
+ for tbl_idx, sheet in enumerate(tables, 1):
288
+ table_meta = dict(base_meta)
289
+ table_meta.update({
290
+ 'sheet_name': sheet.get('sheet_name') or sheet.get('table_title'),
291
+ 'section': sheet.get('section'),
292
+ 'table_number': sheet.get('table_number'),
293
+ 'table_title': sheet.get('table_title')
294
+ })
295
+ table_chunks = chunk_table(sheet, table_meta, max_rows=TABLE_MAX_ROWS_PER_CHUNK)
296
+ all_chunks.extend(table_chunks)
297
+
298
+ images = doc.get('images', []) or doc.get('image_data', []) or doc.get('image_entries', [])
299
+ if images:
300
+ logger.info(f" Processing {len(images)} images")
301
+ for img in images:
302
+ img_meta = dict(base_meta)
303
+ chunk = chunk_image(img, img_meta)
304
+ all_chunks.append(chunk)
305
+
306
+ logger.info("\n" + "=" * 80)
307
+ logger.info("PROCESSING SUMMARY")
308
+ logger.info("=" * 80)
309
+ logger.info(f"Total documents processed: {stats['total_documents']}")
310
+ logger.info(f"Total text chunks created: {stats['total_text_chunks']}")
311
+ logger.info(f"Total tables processed: {stats['total_tables']}")
312
+ logger.info(f"Total table chunks created: {stats['total_table_chunks']}")
313
+ logger.info(f"Total images processed: {stats['total_images']}")
314
+ logger.info(f"Total chunks created: {len(all_chunks)}")
315
+ logger.info(f"Failed files: {stats['failed_files']}")
316
+
317
+ if stats['encoding_errors']:
318
+ logger.warning(f"Files with encoding errors ({len(stats['encoding_errors'])}):")
319
+ for err_file in stats['encoding_errors'][:10]:
320
+ logger.warning(f" - {err_file}")
321
+ if len(stats['encoding_errors']) > 10:
322
+ logger.warning(f" ... and {len(stats['encoding_errors']) - 10} more")
323
+
324
+ logger.info("=" * 80)
325
  return all_chunks
326
 
327
  def create_hybrid_index(documents):
328
+ logger.info("Creating hybrid index system")
329
+ logger.info(f"Building vector index from {len(documents)} documents")
330
  vector_index = VectorStoreIndex.from_documents(documents)
331
+ logger.info("Vector index created successfully")
332
 
333
+ logger.info("Building keyword index")
334
  keyword_index = KeywordTableIndex.from_documents(documents)
335
+ logger.info("Keyword index created successfully")
336
 
337
  return vector_index, keyword_index
338
 
339
  def create_fusion_retriever(vector_index, keyword_index, documents):
340
+ logger.info("Creating fusion retriever with multiple retrieval strategies")
341
+
342
  vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)
343
+ logger.info("Vector retriever configured (top_k=5)")
344
 
345
  bm25_retriever = BM25Retriever.from_defaults(
346
  docstore=vector_index.docstore,
347
  similarity_top_k=5
348
  )
349
+ logger.info("BM25 retriever configured (top_k=5)")
350
 
351
  fusion_retriever = QueryFusionRetriever(
352
  [vector_retriever, bm25_retriever],
 
355
  mode="reciprocal_rerank",
356
  use_async=False
357
  )
358
+ logger.info("Fusion retriever created with reciprocal rerank mode")
359
 
360
  return fusion_retriever
361
 
362
  def create_query_engine(vector_index, keyword_index, documents):
363
+ logger.info("Creating query engine")
364
  fusion_retriever = create_fusion_retriever(vector_index, keyword_index, documents)
365
 
366
  response_synthesizer = get_response_synthesizer(
367
  response_mode=ResponseMode.COMPACT,
368
  use_async=False
369
  )
370
+ logger.info("Response synthesizer configured (COMPACT mode)")
371
 
372
  query_engine = RetrieverQueryEngine(
373
  retriever=fusion_retriever,
374
  response_synthesizer=response_synthesizer
375
  )
376
+ logger.info("Query engine created successfully")
377
 
378
  return query_engine
379
 
380
  def initialize_system():
381
+ logger.info("\n" + "=" * 80)
382
+ logger.info("INITIALIZING AIEXP RAG SYSTEM")
383
+ logger.info("=" * 80)
384
 
385
  embed_model = get_embedding_model()
386
  llm = get_llm_model(DEFAULT_MODEL)
 
389
  Settings.llm = llm
390
  Settings.chunk_size = CHUNK_SIZE
391
  Settings.chunk_overlap = CHUNK_OVERLAP
392
+ logger.info("Global settings configured")
393
 
 
394
  documents = build_chunks_from_repo(HF_REPO_ID)
395
 
 
396
  vector_index, keyword_index = create_hybrid_index(documents)
397
 
 
398
  query_engine = create_query_engine(vector_index, keyword_index, documents)
399
 
400
+ logger.info("=" * 80)
401
+ logger.info("SYSTEM INITIALIZATION COMPLETE")
402
+ logger.info("=" * 80)
403
  return query_engine, vector_index, keyword_index, documents
404
 
405
  def answer_question(question, query_engine):
 
407
  return "<div style='color: black;'>Please enter a question</div>"
408
 
409
  try:
410
+ logger.info(f"Processing query: {question[:100]}...")
411
  response = query_engine.query(question)
412
+ logger.info(f"Query processed, found {len(response.source_nodes)} source nodes")
413
 
414
  answer_html = f"""
415
  <div style='background-color: #f8f9fa; padding: 20px; border-radius: 10px; color: black;'>
 
433
  return answer_html, sources_html
434
 
435
  except Exception as e:
436
+ logger.error(f"Error processing query: {e}", exc_info=True)
437
  error_html = f"<div style='color: red;'>Error: {str(e)}</div>"
438
  return error_html, error_html
439
 
440
  def switch_model(model_name, vector_index, keyword_index, documents):
441
  try:
442
+ logger.info(f"Switching to model: {model_name}")
443
  new_llm = get_llm_model(model_name)
444
  Settings.llm = new_llm
445
 
446
  new_query_engine = create_query_engine(vector_index, keyword_index, documents)
447
+ logger.info(f"Successfully switched to model: {model_name}")
448
  return new_query_engine, f"✅ Model switched to: {model_name}"
449
  except Exception as e:
450
+ logger.error(f"Error switching model: {e}")
451
  return None, f"❌ Error: {str(e)}"
452
 
453
  query_engine = None
 
527
  def main():
528
  global query_engine, vector_index, keyword_index, documents
529
 
530
+ logger.info("Starting AIEXP - AI Expert for Regulatory Documentation")
531
 
532
  query_engine, vector_index, keyword_index, documents = initialize_system()
533
 
534
  if query_engine:
535
+ logger.info("Launching web interface on port 7860")
536
  demo = create_interface()
537
  demo.launch(
538
  server_name="0.0.0.0",
 
540
  share=True
541
  )
542
  else:
543
+ logger.error("Failed to initialize system")
544
  sys.exit(1)
545
 
546
  if __name__ == "__main__":