PriyaMishra committed on
Commit
dbe6595
·
verified ·
1 Parent(s): 5278dab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -17
app.py CHANGED
@@ -3,7 +3,7 @@ import os
3
  api_token = os.getenv("HF_TOKEN")
4
 
5
  from langchain_community.vectorstores import FAISS
6
- from langchain_community.document_loaders import PyPDFLoader, TextLoader
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain.chains import ConversationalRetrievalChain
9
  from langchain_community.embeddings import HuggingFaceEmbeddings
@@ -35,17 +35,13 @@ Chat History: {chat_history}
35
  Craft the response as a seamless, thorough, and authoritative explanation that naturally integrates all aspects of the query.
36
  """
37
 
38
- # Load and split documents
39
  def load_doc(list_file_path):
40
  pages = []
41
  for file_path in list_file_path:
42
- if file_path.endswith('.pdf'):
43
- loader = PyPDFLoader(file_path)
44
- elif file_path.endswith('.txt'):
45
  loader = TextLoader(file_path)
46
- else:
47
- continue
48
- pages.extend(loader.load())
49
 
50
  text_splitter = RecursiveCharacterTextSplitter(
51
  chunk_size=1024,
@@ -99,7 +95,7 @@ def initialize_database(list_file_obj, progress=gr.Progress()):
99
  list_file_path = [x.name for x in list_file_obj if x is not None]
100
  doc_splits = load_doc(list_file_path)
101
  vector_db = create_db(doc_splits)
102
- return vector_db, "Database created!"
103
 
104
  # Initialize LLM
105
  def initialize_LLM(llm_option, llm_temperature, max_tokens, top_k, vector_db, progress=gr.Progress()):
@@ -126,7 +122,7 @@ def conversation(qa_chain, message, history):
126
  for i in range(3):
127
  if i < len(response_sources):
128
  sources_content.append(response_sources[i].page_content.strip())
129
- sources_pages.append(response_sources[i].metadata.get("page", 0) + 1)
130
  else:
131
  sources_content.append("")
132
  sources_pages.append(0)
@@ -141,20 +137,20 @@ def demo():
141
  with gr.Blocks(theme=gr.themes.Default(primary_hue="red", secondary_hue="pink", neutral_hue="sky")) as demo:
142
  vector_db = gr.State()
143
  qa_chain = gr.State()
144
- gr.HTML("<center><h1>RAG Document Chatbot</h1><center>")
145
- gr.Markdown("""<b>Query your documents!</b> This AI agent performs retrieval augmented generation (RAG) on PDF and TXT documents.
146
  <b>Please do not upload confidential documents.</b>
147
  """)
148
 
149
  with gr.Row():
150
  with gr.Column(scale=86):
151
- gr.Markdown("<b>Step 1 - Upload Documents and Initialize RAG pipeline</b>")
152
  with gr.Row():
153
  document = gr.Files(height=300, file_count="multiple",
154
- file_types=["pdf", "txt"], interactive=True,
155
- label="Upload PDF or TXT documents")
156
  with gr.Row():
157
- db_btn = gr.Button("Create vector database")
158
  with gr.Row():
159
  db_progress = gr.Textbox(value="Not initialized", show_label=False)
160
 
@@ -186,7 +182,7 @@ def demo():
186
  with gr.Row():
187
  doc_source = gr.Textbox(label=f"Reference {i}", lines=2,
188
  container=True, scale=20)
189
- source_page = gr.Number(label="Page", scale=1)
190
  with gr.Row():
191
  msg = gr.Textbox(placeholder="Ask a question", container=True)
192
  with gr.Row():
 
3
  api_token = os.getenv("HF_TOKEN")
4
 
5
  from langchain_community.vectorstores import FAISS
6
+ from langchain_community.document_loaders import TextLoader
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain.chains import ConversationalRetrievalChain
9
  from langchain_community.embeddings import HuggingFaceEmbeddings
 
35
  Craft the response as a seamless, thorough, and authoritative explanation that naturally integrates all aspects of the query.
36
  """
37
 
38
+ # Load and split text documents
39
  def load_doc(list_file_path):
40
  pages = []
41
  for file_path in list_file_path:
42
+ if file_path.endswith('.txt'):
 
 
43
  loader = TextLoader(file_path)
44
+ pages.extend(loader.load())
 
 
45
 
46
  text_splitter = RecursiveCharacterTextSplitter(
47
  chunk_size=1024,
 
95
  list_file_path = [x.name for x in list_file_obj if x is not None]
96
  doc_splits = load_doc(list_file_path)
97
  vector_db = create_db(doc_splits)
98
+ return vector_db, "Text database created!"
99
 
100
  # Initialize LLM
101
  def initialize_LLM(llm_option, llm_temperature, max_tokens, top_k, vector_db, progress=gr.Progress()):
 
122
  for i in range(3):
123
  if i < len(response_sources):
124
  sources_content.append(response_sources[i].page_content.strip())
125
+ sources_pages.append(0) # For text files, we don't have page numbers
126
  else:
127
  sources_content.append("")
128
  sources_pages.append(0)
 
137
  with gr.Blocks(theme=gr.themes.Default(primary_hue="red", secondary_hue="pink", neutral_hue="sky")) as demo:
138
  vector_db = gr.State()
139
  qa_chain = gr.State()
140
+ gr.HTML("<center><h1>RAG Text Document Chatbot</h1><center>")
141
+ gr.Markdown("""<b>Query your text documents!</b> This AI agent performs retrieval augmented generation (RAG) on TXT documents.
142
  <b>Please do not upload confidential documents.</b>
143
  """)
144
 
145
  with gr.Row():
146
  with gr.Column(scale=86):
147
+ gr.Markdown("<b>Step 1 - Upload Text Files and Initialize RAG pipeline</b>")
148
  with gr.Row():
149
  document = gr.Files(height=300, file_count="multiple",
150
+ file_types=["txt"], interactive=True,
151
+ label="Upload TXT documents")
152
  with gr.Row():
153
+ db_btn = gr.Button("Create text database")
154
  with gr.Row():
155
  db_progress = gr.Textbox(value="Not initialized", show_label=False)
156
 
 
182
  with gr.Row():
183
  doc_source = gr.Textbox(label=f"Reference {i}", lines=2,
184
  container=True, scale=20)
185
+ source_page = gr.Number(label="Line Range", scale=1, visible=False)
186
  with gr.Row():
187
  msg = gr.Textbox(placeholder="Ask a question", container=True)
188
  with gr.Row():