Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 4, 2025

Commit

65025a2

1 Parent(s): 3b72f75

new version of rag

Browse files

Files changed (1) hide show

app.py +196 -63

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import json
 import zipfile
 from typing import List, Dict, Any
 import pandas as pd
 from huggingface_hub import hf_hub_download, list_repo_files
@@ -16,6 +17,16 @@ from llama_index.llms.openai import OpenAI
 import gradio as gr
 import sys
 GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
 OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
 HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
@@ -60,71 +71,121 @@ TABLE_MAX_ROWS_PER_CHUNK = 30
 os.makedirs(DOWNLOAD_DIR, exist_ok=True)
 def get_llm_model(model_name):
     try:
         model_config = AVAILABLE_MODELS.get(model_name)
         if not model_config:
             model_config = AVAILABLE_MODELS[DEFAULT_MODEL]
         if not model_config.get("api_key"):
-            raise Exception(f"API ключ не найден для модели {model_name}")
         if model_config["provider"] == "google":
-            # Fix: Remove image_config parameter or set it properly
-            return GoogleGenAI(
                 model=model_config["model_name"],
-                api_key=model_config["api_key"],
-                # Don't pass image_config=None
             )
         elif model_config["provider"] == "openai":
-            return OpenAI(
                 model=model_config["model_name"],
                 api_key=model_config["api_key"]
             )
         else:
-            raise Exception(f"Неподдерживаемый провайдер: {model_config['provider']}")
     except Exception as e:
         return GoogleGenAI(
             model="gemini-2.0-flash",
             api_key=GOOGLE_API_KEY
         )
 def get_embedding_model():
     from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-    return HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
 def list_zip_files_in_repo(repo_id: str) -> List[str]:
-    files = list_repo_files(repo_id, repo_type="dataset", token=HF_TOKEN)  # Add repo_type="dataset"
-    return [f for f in files if f.startswith(JSON_FILES_DIR) and f.endswith('.zip')]
 def download_file_from_hf(repo_id: str, path_in_repo: str, dest_dir: str) -> str:
     local_path = hf_hub_download(
         repo_id=repo_id,
         filename=path_in_repo,
         repo_type="dataset",
         token=HF_TOKEN,
-        local_dir=dest_dir  # Add this to download directly to dest_dir
     )
-    return local_path  # Return the path directly
 def read_jsons_from_zip(zip_path: str) -> List[Dict[str, Any]]:
     docs = []
     with zipfile.ZipFile(zip_path, 'r') as z:
-        for name in z.namelist():
-            if name.lower().endswith('.json'):
                 with z.open(name) as f:
-                    try:
-                        text = f.read().decode('utf-8')
-                        data = json.loads(text)
-                        docs.append(data)
-                    except Exception as e:
-                        print(f"Failed to load {name} in {zip_path}: {e}")
     return docs
 def chunk_text_field(text: str, doc_meta: Dict[str, Any], splitter: SentenceSplitter) -> List[Document]:
     nodes = splitter.split_text(text)
     chunks = []
     for i, node_text in enumerate(nodes):
         md = dict(doc_meta)
         md.update({
@@ -132,17 +193,27 @@ def chunk_text_field(text: str, doc_meta: Dict[str, Any], splitter: SentenceSpli
             'chunk_type': 'text'
         })
         chunks.append(Document(text=node_text, metadata=md))
     return chunks
 def chunk_table(table: Dict[str, Any], table_meta: Dict[str, Any], max_rows: int = TABLE_MAX_ROWS_PER_CHUNK) -> List[Document]:
     headers = table.get('headers') or []
     rows = table.get('data') or []
     if not rows:
         text = table.get('table_description') or table.get('table_title') or ''
         md = {**table_meta, 'chunk_type': 'table', 'chunk_id': f"{table_meta.get('document_id')}_table_single"}
         return [Document(text=text, metadata=md)]
     chunks = []
     for i in range(0, len(rows), max_rows):
         block = rows[i:i+max_rows]
         lines = []
@@ -155,6 +226,9 @@ def chunk_table(table: Dict[str, Any], table_meta: Dict[str, Any], max_rows: int
         md = dict(table_meta)
         md.update({'chunk_type': 'table', 'chunk_id': f"{table_meta.get('document_id')}_table_{i // max_rows}"})
         chunks.append(Document(text=chunk_text, metadata=md))
     return chunks
 def chunk_image(image_entry: Dict[str, Any], image_meta: Dict[str, Any]) -> Document:
@@ -163,68 +237,116 @@ def chunk_image(image_entry: Dict[str, Any], image_meta: Dict[str, Any]) -> Docu
     txt += f"Файл: {image_entry.get('Файл изображения') or image_entry.get('file','')}."
     md = dict(image_meta)
     md.update({'chunk_type': 'image', 'chunk_id': f"{image_meta.get('document_id')}_image_{image_entry.get('№ Изображения','0')}"})
     return Document(text=txt, metadata=md)
 def build_chunks_from_repo(repo_id: str) -> List[Document]:
     zip_paths = list_zip_files_in_repo(repo_id)
-    print(f"Found {len(zip_paths)} zip files under {JSON_FILES_DIR} in repo {repo_id}")
     splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
     all_chunks = []
-    for remote_path in zip_paths:
-        print(f"Downloading {remote_path}...")
         local_zip = download_file_from_hf(repo_id, remote_path, DOWNLOAD_DIR)
-        print(f"Parsing {local_zip}...")
         json_docs = read_jsons_from_zip(local_zip)
-        for doc in json_docs:
             doc_meta = doc.get('document_metadata', {})
-            doc_id = doc_meta.get('document_id') or doc_meta.get('document_name') or 'unknown_doc'
             base_meta = {'document_id': doc_id, 'document_name': doc_meta.get('document_name','')}
-            for sec in doc.get('sections', []):
-                sec_meta = dict(base_meta)
-                sec_meta.update({'section_id': sec.get('section_id'), 'section_title': None})
-                text = sec.get('section_text') or sec.get('text') or ''
-                if text and text.strip():
-                    chunks = chunk_text_field(text, sec_meta, splitter)
-                    all_chunks.extend(chunks)
-            for sheet in doc.get('sheets', []) + doc.get('tables', []) if (doc.get('sheets') or doc.get('tables')) else []:
-                table_meta = dict(base_meta)
-                table_meta.update({
-                    'sheet_name': sheet.get('sheet_name') or sheet.get('table_title'),
-                    'section': sheet.get('section'),
-                    'table_number': sheet.get('table_number'),
-                    'table_title': sheet.get('table_title')
-                })
-                table_chunks = chunk_table(sheet, table_meta, max_rows=TABLE_MAX_ROWS_PER_CHUNK)
-                all_chunks.extend(table_chunks)
-            for img in doc.get('images', []) or doc.get('image_data', []) or doc.get('image_entries', []):
-                img_meta = dict(base_meta)
-                chunk = chunk_image(img, img_meta)
-                all_chunks.append(chunk)
-    print(f"Built total {len(all_chunks)} chunks")
     return all_chunks
 def create_hybrid_index(documents):
-    print("Creating vector index...")
     vector_index = VectorStoreIndex.from_documents(documents)
-    print("Creating keyword index...")
     keyword_index = KeywordTableIndex.from_documents(documents)
     return vector_index, keyword_index
 def create_fusion_retriever(vector_index, keyword_index, documents):
     vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)
     bm25_retriever = BM25Retriever.from_defaults(
         docstore=vector_index.docstore,
         similarity_top_k=5
     )
     fusion_retriever = QueryFusionRetriever(
         [vector_retriever, bm25_retriever],
@@ -233,26 +355,32 @@ def create_fusion_retriever(vector_index, keyword_index, documents):
         mode="reciprocal_rerank",
         use_async=False
     )
     return fusion_retriever
 def create_query_engine(vector_index, keyword_index, documents):
     fusion_retriever = create_fusion_retriever(vector_index, keyword_index, documents)
     response_synthesizer = get_response_synthesizer(
         response_mode=ResponseMode.COMPACT,
         use_async=False
     )
     query_engine = RetrieverQueryEngine(
         retriever=fusion_retriever,
         response_synthesizer=response_synthesizer
     )
     return query_engine
 def initialize_system():
-    print("Initializing system...")
     embed_model = get_embedding_model()
     llm = get_llm_model(DEFAULT_MODEL)
@@ -261,17 +389,17 @@ def initialize_system():
     Settings.llm = llm
     Settings.chunk_size = CHUNK_SIZE
     Settings.chunk_overlap = CHUNK_OVERLAP
-    print("Loading documents...")
     documents = build_chunks_from_repo(HF_REPO_ID)
-    print("Creating indices...")
     vector_index, keyword_index = create_hybrid_index(documents)
-    print("Creating query engine...")
     query_engine = create_query_engine(vector_index, keyword_index, documents)
-    print("System initialized successfully!")
     return query_engine, vector_index, keyword_index, documents
 def answer_question(question, query_engine):
@@ -279,7 +407,9 @@ def answer_question(question, query_engine):
         return "<div style='color: black;'>Please enter a question</div>"
     try:
         response = query_engine.query(question)
         answer_html = f"""
         <div style='background-color: #f8f9fa; padding: 20px; border-radius: 10px; color: black;'>
@@ -303,18 +433,21 @@ def answer_question(question, query_engine):
         return answer_html, sources_html
     except Exception as e:
         error_html = f"<div style='color: red;'>Error: {str(e)}</div>"
         return error_html, error_html
 def switch_model(model_name, vector_index, keyword_index, documents):
     try:
-        print(f"Switching to model: {model_name}")
         new_llm = get_llm_model(model_name)
         Settings.llm = new_llm
         new_query_engine = create_query_engine(vector_index, keyword_index, documents)
         return new_query_engine, f"✅ Model switched to: {model_name}"
     except Exception as e:
         return None, f"❌ Error: {str(e)}"
 query_engine = None
@@ -394,12 +527,12 @@ def create_interface():
 def main():
     global query_engine, vector_index, keyword_index, documents
-    print("Starting AIEXP - AI Expert for Regulatory Documentation")
     query_engine, vector_index, keyword_index, documents = initialize_system()
     if query_engine:
-        print("Launching web interface...")
         demo = create_interface()
         demo.launch(
             server_name="0.0.0.0",
@@ -407,7 +540,7 @@ def main():
             share=True
         )
     else:
-        print("Failed to initialize system")
         sys.exit(1)
 if __name__ == "__main__":

 import os
 import json
 import zipfile
+import logging
 from typing import List, Dict, Any
 import pandas as pd
 from huggingface_hub import hf_hub_download, list_repo_files
 import gradio as gr
 import sys
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('rag_system.log'),
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+logger = logging.getLogger(__name__)
 GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
 OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
 HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
 os.makedirs(DOWNLOAD_DIR, exist_ok=True)
+stats = {
+    'total_documents': 0,
+    'total_text_chunks': 0,
+    'total_tables': 0,
+    'total_table_chunks': 0,
+    'total_images': 0,
+    'failed_files': 0,
+    'encoding_errors': []
+}
 def get_llm_model(model_name):
     """Build the LLM client configured under ``model_name``.

     The name is looked up in ``AVAILABLE_MODELS``; an unknown name is
     replaced by the ``DEFAULT_MODEL`` entry. Depending on the entry's
     ``provider`` a ``GoogleGenAI`` or an ``OpenAI`` client is created.

     Any exception raised on that path (missing API key, unsupported
     provider, constructor failure) is logged, and a ``GoogleGenAI`` client
     for ``gemini-2.0-flash`` keyed by ``GOOGLE_API_KEY`` is returned instead.
     """
     try:
         logger.info(f"Initializing LLM model: {model_name}")

         config = AVAILABLE_MODELS.get(model_name)
         if not config:
             logger.warning(f"Model {model_name} not found, using default: {DEFAULT_MODEL}")
             config = AVAILABLE_MODELS[DEFAULT_MODEL]

         if not config.get("api_key"):
             raise Exception(f"API key not found for model {model_name}")

         if config["provider"] == "google":
             client = GoogleGenAI(model=config["model_name"], api_key=config["api_key"])
             logger.info(f"Successfully initialized Google model: {config['model_name']}")
             return client

         if config["provider"] == "openai":
             client = OpenAI(model=config["model_name"], api_key=config["api_key"])
             logger.info(f"Successfully initialized OpenAI model: {config['model_name']}")
             return client

         raise Exception(f"Unsupported provider: {config['provider']}")
     except Exception as err:
         # Best-effort: every failure above degrades to the Gemini fallback.
         logger.error(f"Error initializing model {model_name}: {err}")
         logger.info("Falling back to default Gemini model")
         return GoogleGenAI(model="gemini-2.0-flash", api_key=GOOGLE_API_KEY)
 def get_embedding_model():
     """Create and return the HuggingFace embedding model (all-MiniLM-L6-v2).

     ``HuggingFaceEmbedding`` is imported inside the function, after the
     first log line, exactly as the surrounding file does it.
     """
     logger.info("Initializing embedding model: all-MiniLM-L6-v2")
     from llama_index.embeddings.huggingface import HuggingFaceEmbedding

     model_id = "sentence-transformers/all-MiniLM-L6-v2"
     embedder = HuggingFaceEmbedding(model_name=model_id)
     logger.info("Embedding model initialized successfully")
     return embedder
 def list_zip_files_in_repo(repo_id: str) -> List[str]:
     """Return the ``.zip`` paths located under ``JSON_FILES_DIR`` in a dataset repo.

     Args:
         repo_id: Hugging Face dataset repository identifier.

     Returns:
         Repository-relative paths that start with ``JSON_FILES_DIR`` and end
         with ``.zip``, in the order reported by ``list_repo_files``.
     """
     logger.info(f"Listing files in repository: {repo_id}")
     repo_paths = list_repo_files(repo_id, repo_type="dataset", token=HF_TOKEN)

     archives = []
     for path in repo_paths:
         if path.startswith(JSON_FILES_DIR) and path.endswith('.zip'):
             archives.append(path)

     logger.info(f"Found {len(archives)} zip files in {JSON_FILES_DIR} directory")
     return archives
 def download_file_from_hf(repo_id: str, path_in_repo: str, dest_dir: str) -> str:
     """Download one file of a dataset repo into ``dest_dir``.

     Args:
         repo_id: Hugging Face dataset repository identifier.
         path_in_repo: Path of the file inside the repository.
         dest_dir: Directory passed to ``hf_hub_download`` as ``local_dir``.

     Returns:
         The local path reported by ``hf_hub_download``.
     """
     logger.info(f"Downloading file: {path_in_repo}")
     download_kwargs = {
         'repo_id': repo_id,
         'filename': path_in_repo,
         'repo_type': "dataset",
         'token': HF_TOKEN,
         'local_dir': dest_dir,
     }
     saved_path = hf_hub_download(**download_kwargs)
     logger.info(f"File downloaded to: {saved_path}")
     return saved_path
 def read_jsons_from_zip(zip_path: str) -> List[Dict[str, Any]]:
     """Load every ``.json`` member of a zip archive.

     Each member is decoded with the first candidate encoding that yields
     parseable JSON. Members that cannot be read or parsed are skipped,
     counted in ``stats['failed_files']`` and logged; members that fail with
     every encoding are also recorded in ``stats['encoding_errors']``.

     Args:
         zip_path: Path of the zip archive on disk.

     Returns:
         The parsed JSON values, in archive order, for members that loaded.
     """
     # Order matters. 'utf-8-sig' accepts UTF-8 with or without a BOM.
     # 'latin-1' maps every possible byte and therefore never fails to decode,
     # so it must come last: placed before 'cp1251' it would shadow it and
     # Cyrillic cp1251 files would be loaded as mojibake.
     candidate_encodings = ('utf-8-sig', 'cp1251', 'latin-1')

     logger.info(f"Reading JSON files from zip: {zip_path}")
     docs = []
     json_count = 0
     failed_count = 0
     with zipfile.ZipFile(zip_path, 'r') as z:
         json_files = [name for name in z.namelist() if name.lower().endswith('.json')]
         logger.info(f"Found {len(json_files)} JSON files in zip")
         for name in json_files:
             try:
                 with z.open(name) as f:
                     raw_bytes = f.read()
                 for encoding in candidate_encodings:
                     try:
                         data = json.loads(raw_bytes.decode(encoding))
                     except (UnicodeDecodeError, json.JSONDecodeError):
                         continue
                     docs.append(data)
                     json_count += 1
                     logger.debug(f"Successfully loaded {name} with encoding {encoding}")
                     break
                 else:
                     failed_count += 1
                     stats['failed_files'] += 1
                     stats['encoding_errors'].append(name)
                     logger.warning(f"Failed to load {name} - tried all encodings")
             except Exception as e:
                 # Best-effort: one unreadable member must not abort the archive.
                 failed_count += 1
                 stats['failed_files'] += 1
                 logger.error(f"Error processing {name}: {e}")
     logger.info(f"Successfully loaded {json_count} JSON files, failed: {failed_count}")
     return docs
 def chunk_text_field(text: str, doc_meta: Dict[str, Any], splitter: SentenceSplitter) -> List[Document]:
     nodes = splitter.split_text(text)
     chunks = []
     for i, node_text in enumerate(nodes):
         md = dict(doc_meta)
         md.update({
             'chunk_type': 'text'
         })
         chunks.append(Document(text=node_text, metadata=md))
+    stats['total_text_chunks'] += len(chunks)
+    logger.debug(f"Created {len(chunks)} text chunks for document {doc_meta.get('document_id')}")
     return chunks
 def chunk_table(table: Dict[str, Any], table_meta: Dict[str, Any], max_rows: int = TABLE_MAX_ROWS_PER_CHUNK) -> List[Document]:
     headers = table.get('headers') or []
     rows = table.get('data') or []
+    stats['total_tables'] += 1
     if not rows:
         text = table.get('table_description') or table.get('table_title') or ''
         md = {**table_meta, 'chunk_type': 'table', 'chunk_id': f"{table_meta.get('document_id')}_table_single"}
+        stats['total_table_chunks'] += 1
+        logger.debug(f"Created single chunk for empty table: {table_meta.get('table_title')}")
         return [Document(text=text, metadata=md)]
     chunks = []
+    num_chunks = (len(rows) + max_rows - 1) // max_rows
     for i in range(0, len(rows), max_rows):
         block = rows[i:i+max_rows]
         lines = []
         md = dict(table_meta)
         md.update({'chunk_type': 'table', 'chunk_id': f"{table_meta.get('document_id')}_table_{i // max_rows}"})
         chunks.append(Document(text=chunk_text, metadata=md))
+    stats['total_table_chunks'] += len(chunks)
+    logger.debug(f"Table '{table_meta.get('table_title')}': {len(rows)} rows split into {len(chunks)} chunks")
     return chunks
 def chunk_image(image_entry: Dict[str, Any], image_meta: Dict[str, Any]) -> Document:
     txt += f"Файл: {image_entry.get('Файл изображения') or image_entry.get('file','')}."
     md = dict(image_meta)
     md.update({'chunk_type': 'image', 'chunk_id': f"{image_meta.get('document_id')}_image_{image_entry.get('№ Изображения','0')}"})
+    stats['total_images'] += 1
+    logger.debug(f"Created image chunk: {image_entry.get('Название изображения', 'unknown')}")
     return Document(text=txt, metadata=md)
 def _log_processing_summary(total_chunks: int) -> None:
     """Log the module-level ``stats`` counters together with the chunk total."""
     logger.info("\n" + "=" * 80)
     logger.info("PROCESSING SUMMARY")
     logger.info("=" * 80)
     logger.info(f"Total documents processed: {stats['total_documents']}")
     logger.info(f"Total text chunks created: {stats['total_text_chunks']}")
     logger.info(f"Total tables processed: {stats['total_tables']}")
     logger.info(f"Total table chunks created: {stats['total_table_chunks']}")
     logger.info(f"Total images processed: {stats['total_images']}")
     logger.info(f"Total chunks created: {total_chunks}")
     logger.info(f"Failed files: {stats['failed_files']}")
     if stats['encoding_errors']:
         logger.warning(f"Files with encoding errors ({len(stats['encoding_errors'])}):")
         # Only the first ten names are listed to keep the log readable.
         for err_file in stats['encoding_errors'][:10]:
             logger.warning(f"  - {err_file}")
         if len(stats['encoding_errors']) > 10:
             logger.warning(f"  ... and {len(stats['encoding_errors']) - 10} more")
     logger.info("=" * 80)


 def build_chunks_from_repo(repo_id: str) -> List[Document]:
     """Download every zip archive of the repo and turn its JSON documents into chunks.

     For each document the text sections are split with a ``SentenceSplitter``,
     the entries under ``sheets``/``tables`` are chunked with ``chunk_table``
     and the image entries with ``chunk_image``. Progress and a final summary
     are written to the log.

     Args:
         repo_id: Hugging Face dataset repository identifier.

     Returns:
         All chunks produced, in processing order.
     """
     logger.info("=" * 80)
     logger.info("Starting document processing from repository")
     logger.info("=" * 80)
     zip_paths = list_zip_files_in_repo(repo_id)
     logger.info(f"Total zip files to process: {len(zip_paths)}")
     splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
     logger.info(f"Text splitter configured: chunk_size={CHUNK_SIZE}, chunk_overlap={CHUNK_OVERLAP}")
     all_chunks = []
     for zip_idx, remote_path in enumerate(zip_paths, 1):
         logger.info(f"\n[{zip_idx}/{len(zip_paths)}] Processing zip file: {remote_path}")
         local_zip = download_file_from_hf(repo_id, remote_path, DOWNLOAD_DIR)
         json_docs = read_jsons_from_zip(local_zip)
         logger.info(f"Processing {len(json_docs)} documents from {remote_path}")
         stats['total_documents'] += len(json_docs)
         for doc_idx, doc in enumerate(json_docs, 1):
             # A JSON file may legally hold a list or scalar at top level;
             # such a value has no sections/tables/images to extract.
             if not isinstance(doc, dict):
                 logger.warning(f"  Document [{doc_idx}/{len(json_docs)}] is not a JSON object, skipping")
                 continue
             # `or {}` / `or []` also cover keys that are present with a null value.
             doc_meta = doc.get('document_metadata') or {}
             doc_id = doc_meta.get('document_id') or doc_meta.get('document_name') or f'unknown_doc_{doc_idx}'
             base_meta = {'document_id': doc_id, 'document_name': doc_meta.get('document_name','')}
             logger.info(f"  Document [{doc_idx}/{len(json_docs)}]: {doc_id}")
             sections = doc.get('sections') or []
             if sections:
                 logger.info(f"    Processing {len(sections)} text sections")
                 for sec in sections:
                     sec_meta = dict(base_meta)
                     sec_meta.update({'section_id': sec.get('section_id'), 'section_title': None})
                     text = sec.get('section_text') or sec.get('text') or ''
                     if text and text.strip():
                         all_chunks.extend(chunk_text_field(text, sec_meta, splitter))
             # Concatenating with `or []` avoids `None + list` when only one of
             # the two keys carries data and the other is null.
             tables = (doc.get('sheets') or []) + (doc.get('tables') or [])
             if tables:
                 logger.info(f"    Processing {len(tables)} tables")
                 for sheet in tables:
                     table_meta = dict(base_meta)
                     table_meta.update({
                         'sheet_name': sheet.get('sheet_name') or sheet.get('table_title'),
                         'section': sheet.get('section'),
                         'table_number': sheet.get('table_number'),
                         'table_title': sheet.get('table_title')
                     })
                     all_chunks.extend(chunk_table(sheet, table_meta, max_rows=TABLE_MAX_ROWS_PER_CHUNK))
             # The first non-empty of the three alternative image keys wins.
             images = doc.get('images') or doc.get('image_data') or doc.get('image_entries') or []
             if images:
                 logger.info(f"    Processing {len(images)} images")
                 for img in images:
                     all_chunks.append(chunk_image(img, dict(base_meta)))
     _log_processing_summary(len(all_chunks))
     return all_chunks
 def create_hybrid_index(documents):
     """Build a vector index and a keyword index over the same documents.

     Returns:
         ``(vector_index, keyword_index)``, a ``VectorStoreIndex`` and a
         ``KeywordTableIndex`` created with ``from_documents``.
     """
     logger.info("Creating hybrid index system")

     logger.info(f"Building vector index from {len(documents)} documents")
     dense_index = VectorStoreIndex.from_documents(documents)
     logger.info("Vector index created successfully")

     logger.info("Building keyword index")
     sparse_index = KeywordTableIndex.from_documents(documents)
     logger.info("Keyword index created successfully")

     return dense_index, sparse_index
 def create_fusion_retriever(vector_index, keyword_index, documents):
+    logger.info("Creating fusion retriever with multiple retrieval strategies")
     vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)
+    logger.info("Vector retriever configured (top_k=5)")
     bm25_retriever = BM25Retriever.from_defaults(
         docstore=vector_index.docstore,
         similarity_top_k=5
     )
+    logger.info("BM25 retriever configured (top_k=5)")
     fusion_retriever = QueryFusionRetriever(
         [vector_retriever, bm25_retriever],
         mode="reciprocal_rerank",
         use_async=False
     )
+    logger.info("Fusion retriever created with reciprocal rerank mode")
     return fusion_retriever
 def create_query_engine(vector_index, keyword_index, documents):
     """Assemble a ``RetrieverQueryEngine`` on top of the fusion retriever.

     The retriever comes from ``create_fusion_retriever``; responses are
     synthesized synchronously in ``ResponseMode.COMPACT``.
     """
     logger.info("Creating query engine")
     retriever = create_fusion_retriever(vector_index, keyword_index, documents)

     synthesizer = get_response_synthesizer(response_mode=ResponseMode.COMPACT, use_async=False)
     logger.info("Response synthesizer configured (COMPACT mode)")

     engine = RetrieverQueryEngine(retriever=retriever, response_synthesizer=synthesizer)
     logger.info("Query engine created successfully")
     return engine
 def initialize_system():
+    logger.info("\n" + "=" * 80)
+    logger.info("INITIALIZING AIEXP RAG SYSTEM")
+    logger.info("=" * 80)
     embed_model = get_embedding_model()
     llm = get_llm_model(DEFAULT_MODEL)
     Settings.llm = llm
     Settings.chunk_size = CHUNK_SIZE
     Settings.chunk_overlap = CHUNK_OVERLAP
+    logger.info("Global settings configured")
     documents = build_chunks_from_repo(HF_REPO_ID)
     vector_index, keyword_index = create_hybrid_index(documents)
     query_engine = create_query_engine(vector_index, keyword_index, documents)
+    logger.info("=" * 80)
+    logger.info("SYSTEM INITIALIZATION COMPLETE")
+    logger.info("=" * 80)
     return query_engine, vector_index, keyword_index, documents
 def answer_question(question, query_engine):
         return "<div style='color: black;'>Please enter a question</div>"
     try:
+        logger.info(f"Processing query: {question[:100]}...")
         response = query_engine.query(question)
+        logger.info(f"Query processed, found {len(response.source_nodes)} source nodes")
         answer_html = f"""
         <div style='background-color: #f8f9fa; padding: 20px; border-radius: 10px; color: black;'>
         return answer_html, sources_html
     except Exception as e:
+        logger.error(f"Error processing query: {e}", exc_info=True)
         error_html = f"<div style='color: red;'>Error: {str(e)}</div>"
         return error_html, error_html
 def switch_model(model_name, vector_index, keyword_index, documents):
     """Install a different LLM in ``Settings`` and rebuild the query engine.

     Returns:
         ``(query_engine, status_message)``. On any exception the engine is
         ``None`` and the message carries the error text.
     """
     try:
         logger.info(f"Switching to model: {model_name}")
         Settings.llm = get_llm_model(model_name)
         engine = create_query_engine(vector_index, keyword_index, documents)
         logger.info(f"Successfully switched to model: {model_name}")
         status = f"✅ Model switched to: {model_name}"
         return engine, status
     except Exception as err:
         logger.error(f"Error switching model: {err}")
         status = f"❌ Error: {str(err)}"
         return None, status
 query_engine = None
 def main():
     global query_engine, vector_index, keyword_index, documents
+    logger.info("Starting AIEXP - AI Expert for Regulatory Documentation")
     query_engine, vector_index, keyword_index, documents = initialize_system()
     if query_engine:
+        logger.info("Launching web interface on port 7860")
         demo = create_interface()
         demo.launch(
             server_name="0.0.0.0",
             share=True
         )
     else:
+        logger.error("Failed to initialize system")
         sys.exit(1)
 if __name__ == "__main__":