Commit fb442dd · 1 parent: 8fefe0f
new app py + init data

Files changed: app.py (+14 -46), document_processor.py (+66 -47)
app.py
CHANGED
@@ -6,6 +6,7 @@ from llama_index.core import Settings
 from config import *
 from document_processor import *
 from llama_index.core.chat_engine import CondensePlusContextChatEngine
+import faiss
 
 #new thing
 query_engine = None
@@ -22,7 +23,6 @@ def answer_question(question, history):
     try:
         start_time = time.time()
 
-        # Initialize chat engine if not exists
         if chat_engine is None:
             chat_engine = CondensePlusContextChatEngine.from_defaults(
                 retriever=query_engine.retriever,
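Note: the hunk drops the explanatory comment but keeps the lazy-initialization pattern. A minimal standalone sketch of that pattern, assuming a prebuilt query_engine whose retriever is reused; the llm and memory arguments are left to llama_index defaults:

from llama_index.core.chat_engine import CondensePlusContextChatEngine

chat_engine = None  # module-level cache, as in app.py

def get_chat_engine(query_engine):
    # Build the chat engine once, then reuse it for every question.
    global chat_engine
    if chat_engine is None:
        chat_engine = CondensePlusContextChatEngine.from_defaults(
            retriever=query_engine.retriever,
        )
    return chat_engine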
@@ -45,17 +45,15 @@ def answer_question(question, history):
                 </div>
             </div>"""
 
-
-        new_history …
-
-        new_history = new_history[-3:]
+        new_history = history + [{"role": "user", "content": question}, {"role": "assistant", "content": response.response}]
+        if len(new_history) > 6:
+            new_history = new_history[-6:]
 
         return answer_with_time, sources_html, new_history
 
     except Exception as e:
         error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>❌ Error processing question: {str(e)}</div>"
         return error_msg, "", history
-
 
 
 def generate_sources_html(nodes):
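The rewritten history handling stores OpenAI-style role/content dicts and keeps at most six messages (three question/answer turns), where the old code kept the last three entries. A minimal sketch of the new bookkeeping in isolation; max_messages names the literal 6 from the diff:

def append_turn(history, question, answer, max_messages=6):
    # One turn = a user message plus an assistant message.
    new_history = history + [
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer},
    ]
    # Keep only the most recent max_messages entries (whole turns).
    return new_history[-max_messages:] if len(new_history) > max_messages else new_history

# After four turns, only the last three survive:
h = []
for i in range(4):
    h = append_turn(h, f"q{i}", f"a{i}")
assert [m["content"] for m in h if m["role"] == "user"] == ["q1", "q2", "q3"]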
@@ -164,7 +162,8 @@ def create_interface():
             chatbot = gr.Chatbot(
                 label="Chat History",
                 height=400,
-                show_label=True
+                show_label=True,
+                type="messages"
             )
 
             question_input = gr.Textbox(
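type="messages" switches gr.Chatbot from the legacy list-of-pairs format to the same role/content dicts that answer_question now returns, so the trimmed history can be passed straight through. A minimal sketch, assuming a Gradio version (4.44 or later) where the parameter exists:

import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Chat History", height=400, show_label=True, type="messages")
    # In messages mode the component consumes plain dicts, not (user, bot) tuples.
    demo.load(lambda: [{"role": "assistant", "content": "Ask me about your documents."}],
              outputs=chatbot)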
@@ -227,39 +226,35 @@ def create_interface():
 
             refresh_btn = gr.Button("🔄 Refresh List", variant="secondary")
 
-        with gr.Column(scale=1…
-            gr.Markdown("#### Upload new documents"…
-            gr.Markdown("Supported formats: PDF, TXT"…
+        with gr.Column(scale=1):
+            gr.Markdown("#### Upload new documents")
+            gr.Markdown("Supported formats: PDF, TXT")
 
             file_upload = gr.File(
                 file_count="multiple",
                 file_types=[".pdf", ".txt"],
-                label="Select files to upload",
-                elem_classes=["upload-file"]
+                label="Select files to upload"
             )
 
             doc_names_input = gr.Textbox(
                 label="Document names (one per line)",
                 placeholder="Enter document names, one per line...",
-                lines=5,
-                elem_classes=["upload-input"]
+                lines=5
             )
 
             doc_links_input = gr.Textbox(
                 label="Document links (one per line)",
                 placeholder="Enter document links, one per line...",
-                lines=5,
-                elem_classes=["upload-input"]
+                lines=5
             )
 
-            upload_btn = gr.Button("📤 Upload and Process", variant="primary"…
+            upload_btn = gr.Button("📤 Upload and Process", variant="primary")
 
             upload_status = gr.Textbox(
                 label="Upload status",
                 lines=8,
                 max_lines=10,
-                interactive=False,
-                elem_classes=["upload-status"]
+                interactive=False
            )
 
        def process_names_and_links(names_text, links_text):
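The hunk only declares the upload widgets; the click wiring sits outside the visible context. A hypothetical sketch of how such a column is typically wired up; process_uploads and its argument order are illustrative assumptions, not code from this commit:

import gradio as gr

def process_uploads(files, names_text, links_text):
    # Hypothetical stand-in for the real handler in app.py.
    return f"Received {len(files or [])} file(s)."

with gr.Blocks() as demo:
    with gr.Column(scale=1):
        file_upload = gr.File(file_count="multiple", file_types=[".pdf", ".txt"],
                              label="Select files to upload")
        doc_names_input = gr.Textbox(label="Document names (one per line)", lines=5)
        doc_links_input = gr.Textbox(label="Document links (one per line)", lines=5)
        upload_btn = gr.Button("📤 Upload and Process", variant="primary")
        upload_status = gr.Textbox(label="Upload status", lines=8, interactive=False)
    upload_btn.click(fn=process_uploads,
                     inputs=[file_upload, doc_names_input, doc_links_input],
                     outputs=[upload_status])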
@@ -281,35 +276,8 @@ def create_interface():
         outputs=[documents_display]
     )
 
-    # Add CSS to fix white background in upload tab
-    demo.css = """
-    #upload-column {
-        background-color: #f8f9fa !important;
-        padding: 20px !important;
-        border-radius: 10px !important;
-        border: 1px solid #e9ecef !important;
-    }
-    .upload-header h4 {
-        color: #2d3748 !important;
-        margin-bottom: 10px !important;
-    }
-    .upload-info {
-        color: #666 !important;
-        margin-bottom: 15px !important;
-    }
-    .upload-file, .upload-input, .upload-status {
-        background-color: white !important;
-        border: 1px solid #ced4da !important;
-        border-radius: 5px !important;
-    }
-    .upload-btn {
-        margin-top: 10px !important;
-    }
-    """
-
     return demo
 
-
 if __name__ == "__main__":
     try:
         log_message("🚀 Starting AIEXP - AI Expert for Regulatory Documentation")
document_processor.py
CHANGED
@@ -10,6 +10,9 @@ from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
 from llama_index.core.prompts import PromptTemplate
 from config import *
 import shutil
+import faiss
+from huggingface_hub import hf_hub_download
+
 
 def log_message(message):
     print(message, flush=True)
@@ -184,61 +187,77 @@ def add_to_vector_index(new_chunks, file_info, existing_chunks_df=None):
         return None, existing_chunks_df, str(e)
 
 def initialize_system():
+    global query_engine, chunks_df
+
     try:
-        log_message("🚀 Initializing…
+        log_message("🚀 Initializing system...")
         os.makedirs(download_dir, exist_ok=True)
 
-        …
+        log_message("📥 Loading files...")
+        faiss_index_path = hf_hub_download(
+            repo_id=REPO_ID,
+            filename=faiss_index_filename,
+            local_dir=download_dir,
+            repo_type="dataset",
+            token=HF_TOKEN
+        )
+
+        chunks_csv_path = hf_hub_download(
+            repo_id=REPO_ID,
+            filename=chunks_filename,
+            local_dir=download_dir,
+            repo_type="dataset",
+            token=HF_TOKEN
+        )
+
+        log_message("📊 Loading index and data...")
+        index_faiss = faiss.read_index(faiss_index_path)
+        chunks_df = pd.read_csv(chunks_csv_path)
+
+        log_message("🤖 Setting up models...")
+        embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
+
+        text_column = None
+        for col in chunks_df.columns:
+            if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower():
+                text_column = col
+                break
+
+        if text_column is None:
+            text_column = chunks_df.columns[0]
+
+        log_message("📄 Creating documents...")
+        documents = [Document(text=str(row[text_column]),
+                              metadata={"chunk_id": row.get('chunk_id', i),
                                         "document_id": row.get('document_id', 'unknown'),
                                         "document_name": row.get('document_name', 'unknown'),
-                                        "document_link": row.get('document_link', '')
-            …
-        )
+                                        "document_link": row.get('document_link', '')})
+                     for i, (_, row) in enumerate(chunks_df.iterrows())]
+
+        log_message("🔍 Building vector index...")
+        vector_index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
+
+        retriever = VectorIndexRetriever(
+            index=vector_index,
+            similarity_top_k=RETRIEVER_TOP_K,
+            similarity_cutoff=SIMILARITY_THRESHOLD
+        )
+
+        custom_prompt_template = PromptTemplate(CUSTOM_PROMPT)
+        response_synthesizer = get_response_synthesizer(
+            response_mode=ResponseMode.TREE_SUMMARIZE,
+            text_qa_template=custom_prompt_template
+        )
+
+        query_engine = RetrieverQueryEngine(
+            retriever=retriever,
+            response_synthesizer=response_synthesizer
+        )
 
         log_message("✅ System successfully initialized!")
         return query_engine, chunks_df, True
 
     except Exception as e:
         log_message(f"❌ Initialization error: {str(e)}")
-        …
+        chunks_df = pd.DataFrame(columns=['chunk_id', 'document_id', 'document_name', 'document_link', 'chunk_text'])
+        return None, chunks_df, False
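The new initialize_system pulls a prebuilt FAISS index and a chunk table from a Hugging Face dataset repo before rebuilding the vector index locally. A minimal standalone sketch of the download-and-load half, assuming REPO_ID, faiss_index_filename, chunks_filename, download_dir, and HF_TOKEN come from config.py as in the diff. Note that the hunk reads the FAISS index into index_faiss, but the retriever is then built from a fresh VectorStoreIndex over the CSV rows:

import faiss
import pandas as pd
from huggingface_hub import hf_hub_download

def fetch_index_and_chunks(repo_id, index_filename, chunks_filename, local_dir, token=None):
    # Download both artifacts from a dataset repo into local_dir.
    index_path = hf_hub_download(repo_id=repo_id, filename=index_filename,
                                 local_dir=local_dir, repo_type="dataset", token=token)
    csv_path = hf_hub_download(repo_id=repo_id, filename=chunks_filename,
                               local_dir=local_dir, repo_type="dataset", token=token)
    # Deserialize the FAISS index and the chunk metadata table.
    return faiss.read_index(index_path), pd.read_csv(csv_path)

# Usage mirroring the diff:
# index_faiss, chunks_df = fetch_index_and_chunks(REPO_ID, faiss_index_filename,
#                                                 chunks_filename, download_dir, HF_TOKEN)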
|