Create app.py
app.py
ADDED
@@ -0,0 +1,353 @@
import os
import tempfile
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader, UnstructuredPowerPointLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain_community.llms import HuggingFacePipeline

# Configure environment
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL = "google/flan-t5-large"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
THRESHOLD = 0.7  # Relevance threshold for retrieval
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
TEMPERATURE = 0.1
MAX_NEW_TOKENS = 512
TOP_K = 3  # Number of chunks to retrieve

# Store for conversation history
conversation_history = {}
current_session_id = None
current_document_store = None
current_document_name = None

FILE_EXTENSIONS = {
    ".pdf": PyPDFLoader,
    ".txt": TextLoader,
    ".docx": Docx2txtLoader,
    ".pptx": UnstructuredPowerPointLoader,
}

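# Note: with the RecursiveCharacterTextSplitter configured below,
# CHUNK_OVERLAP means adjacent chunks share up to 200 characters, so a
# sentence cut at a chunk boundary still appears intact in at least one chunk.
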
class DocumentAIBot:
    def __init__(self):
        self.setup_models()

    def setup_models(self):
        print("Setting up models...")
        # Set up embedding model
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL,
            model_kwargs={"device": DEVICE},
            encode_kwargs={"normalize_embeddings": True}
        )

        # Set up LLM model
        self.tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
        self.llm_model = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL).to(DEVICE)

        # Create text generation pipeline. transformers ignores `temperature`
        # unless sampling is enabled, so do_sample=True is set explicitly.
        self.text_generation_pipeline = pipeline(
            "text2text-generation",
            model=self.llm_model,
            tokenizer=self.tokenizer,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            do_sample=True,
            device=0 if DEVICE == "cuda" else -1
        )

        # Wrap the pipeline so LangChain can use it as an LLM
        self.llm = HuggingFacePipeline(pipeline=self.text_generation_pipeline)

        # Text splitter for document chunking
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            length_function=len
        )

        print("Models loaded successfully!")

    def process_document(self, file_path):
        """Process a document and create a vector store."""
        print(f"Processing document: {file_path}")
        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension not in FILE_EXTENSIONS:
            raise ValueError(f"Unsupported file format: {file_extension}")

        # Select appropriate loader
        loader_class = FILE_EXTENSIONS[file_extension]
        loader = loader_class(file_path)

        # Load and split the document
        documents = loader.load()
        chunks = self.text_splitter.split_documents(documents)

        if not chunks:
            raise ValueError("No content extracted from the document")

        print(f"Document split into {len(chunks)} chunks")

        # Create vector store
        vector_store = FAISS.from_documents(chunks, self.embedding_model)
        return vector_store

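    # Standalone sanity check (a sketch, assuming a local sample.pdf exists;
    # not part of the app flow):
    #   bot = DocumentAIBot()
    #   store = bot.process_document("sample.pdf")
    #   print(store.similarity_search("What is this document about?", k=TOP_K))
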
    def setup_retrieval_chain(self, vector_store):
        """Set up the retrieval chain with the vector store."""
        retriever = vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={
                "k": TOP_K,
                "score_threshold": THRESHOLD
            }
        )

        chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=retriever,
            return_source_documents=True,
            verbose=True
        )

        return chain

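    # Note on THRESHOLD: because the embeddings are normalized above,
    # LangChain maps FAISS distances onto a 0-1 relevance scale, so
    # score_threshold=0.7 drops weakly related chunks. If no chunk clears
    # the threshold, the retriever returns no documents and the chain
    # answers from the question alone.
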
    def get_answer(self, question, session_id, vector_store, chat_history):
        """Get answer for a question using the retrieval chain."""
        if not question.strip():
            return "Please enter a question related to the document.", chat_history

        # Set up the retrieval chain for this vector store
        retrieval_chain = self.setup_retrieval_chain(vector_store)

        # Format chat history for the model
        formatted_chat_history = [(q, a) for q, a in chat_history]

        # Get response from the chain; invoke() replaces the deprecated
        # direct-call syntax in recent LangChain releases
        response = retrieval_chain.invoke(
            {"question": question, "chat_history": formatted_chat_history}
        )

        answer = response["answer"]
        source_documents = response.get("source_documents", [])

        # Format answer with source information
        if source_documents:
            source_info = "\n\nSources:"
            seen_sources = set()

            for doc in source_documents:
                source = doc.metadata.get("source", "Unknown source")
                page = doc.metadata.get("page", "Unknown page")

                source_key = f"{source}-{page}"
                if source_key not in seen_sources:
                    seen_sources.add(source_key)
                    if source == "Unknown source":
                        source_info += f"\n- Document chunk (page {page})"
                    else:
                        source_info += f"\n- {os.path.basename(source)} (page {page})"

            answer += source_info

        return answer, chat_history + [(question, answer)]

def generate_session_id():
    """Generate a unique session ID."""
    import uuid
    return str(uuid.uuid4())

def save_uploaded_file(file):
    """Return a local path for the uploaded file.

    With gr.File(type="filepath"), Gradio has already saved the upload to a
    temporary file and passes its path as a string, so no copy is needed.
    The fallback below handles file-like objects from older Gradio versions.
    """
    if isinstance(file, str):
        return file

    temp_path = os.path.join(tempfile.gettempdir(), os.path.basename(file.name))
    with open(temp_path, "wb") as f:
        f.write(file.read())

    return temp_path

def clear_conversation():
    """Clear the conversation history for the current session."""
    global conversation_history, current_session_id

    if current_session_id and current_session_id in conversation_history:
        conversation_history[current_session_id] = []

    if current_document_name:
        return [], f"Conversation cleared. You can continue asking questions about '{current_document_name}'."
    return [], "Conversation cleared. Upload a document to start asking questions."

def process_uploaded_document(file):
    """Process an uploaded document and set up the session."""
    global current_session_id, current_document_store, current_document_name, conversation_history

    try:
        if file is None:
            return [], "Please upload a document first."

        # Resolve the upload to a local file path
        file_path = save_uploaded_file(file)
        document_name = os.path.basename(file_path)

        # Create document AI bot if not already created
        if not hasattr(process_uploaded_document, "bot"):
            process_uploaded_document.bot = DocumentAIBot()

        # Process the document
        vector_store = process_uploaded_document.bot.process_document(file_path)

        # Create a new session
        session_id = generate_session_id()
        conversation_history[session_id] = []

        # Update global variables
        current_session_id = session_id
        current_document_store = vector_store
        current_document_name = document_name

        return [], f"Document '{document_name}' processed successfully. You can now ask questions about it."

    except Exception as e:
        import traceback
        traceback.print_exc()
        return [], f"Error processing document: {e}"

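# Storing the bot on the function object (process_uploaded_document.bot)
# acts as a lazy singleton: the models load once, on the first upload,
# rather than at import time.
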
def answer_question(question, history):
    """Answer a question about the current document."""
    global current_session_id, current_document_store, conversation_history

    history = history or []

    if not question or not question.strip():
        return history

    if not current_document_store:
        return history + [(question, "Please upload a document first.")]

    if not hasattr(process_uploaded_document, "bot"):
        return history + [(question, "Document AI bot not initialized. Please reload the page and try again.")]

    try:
        # Get current chat history
        chat_history = conversation_history.get(current_session_id, [])

        # Get answer
        answer, updated_history = process_uploaded_document.bot.get_answer(
            question,
            current_session_id,
            current_document_store,
            chat_history
        )

        # Update conversation history
        conversation_history[current_session_id] = updated_history

        # The gr.Chatbot output expects the full list of (user, bot) tuples,
        # not a bare string, so return the updated history.
        return updated_history

    except Exception as e:
        import traceback
        traceback.print_exc()
        return history + [(question, f"Error generating answer: {e}")]

def build_interface():
    """Build and launch the Gradio interface."""
    # Define the Gradio blocks
    with gr.Blocks(title="Document AI Chatbot") as interface:
        gr.Markdown("# 📄 Document AI Chatbot")
        gr.Markdown("Upload a document (PDF, TXT, DOCX, PPTX) and ask questions about its content.")

        with gr.Row():
            with gr.Column(scale=1):
                # Document upload and processing section. type="filepath"
                # hands the handler a path string ("file" was removed in
                # Gradio 4).
                file_input = gr.File(
                    label="Upload Document",
                    file_types=[".pdf", ".txt", ".docx", ".pptx"],
                    type="filepath"
                )

                upload_button = gr.Button("Process Document", variant="primary")
                upload_status = gr.Textbox(label="Upload Status", interactive=False)

                clear_button = gr.Button("Clear Conversation")

                gr.Markdown("### System Information")
                system_info = gr.Markdown(f"""
                - Embedding Model: {EMBEDDING_MODEL}
                - Language Model: {LLM_MODEL}
                - Running on: {DEVICE}
                - Chunk Size: {CHUNK_SIZE}
                - Relevance Threshold: {THRESHOLD}
                """)

            with gr.Column(scale=2):
                # Chat interface
                chatbot = gr.Chatbot(
                    label="Conversation",
                    height=500,
                    show_label=True,
                )

                with gr.Row():
                    question_input = gr.Textbox(
                        label="Ask a question about the document",
                        placeholder="What is the main topic of this document?",
                        lines=2,
                        max_lines=5,
                        interactive=True,
                        show_label=True
                    )

                submit_button = gr.Button("Submit", variant="primary")

        # Set up event handlers
        upload_button.click(
            process_uploaded_document,
            inputs=[file_input],
            outputs=[chatbot, upload_status]
        )

        submit_button.click(
            answer_question,
            inputs=[question_input, chatbot],
            outputs=chatbot
        ).then(
            lambda: "",
            None,
            question_input
        )

        question_input.submit(
            answer_question,
            inputs=[question_input, chatbot],
            outputs=chatbot
        ).then(
            lambda: "",
            None,
            question_input
        )

        clear_button.click(
            clear_conversation,
            inputs=[],
            outputs=[chatbot, upload_status]
        )

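        # Chaining .then() clears the question box only after the answer
        # handler completes, so the question stays visible while generating.
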
        # Apply light styling via JavaScript on page load
        interface.load(
            js="""
            () => {
                document.querySelector('body').style.backgroundColor = '#f7f7f7';
                document.querySelector('.gradio-container').style.maxWidth = '1200px';
            }
            """
        )

    return interface

# Main execution
if __name__ == "__main__":
    demo = build_interface()
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        debug=True,
        show_api=False
    )
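
# Assumed dependencies for this Space (a sketch of requirements.txt; exact
# versions are not part of this commit): gradio, torch, transformers,
# langchain, langchain-community, sentence-transformers, faiss-cpu, pypdf,
# docx2txt, and unstructured (with python-pptx) for the PowerPoint loader.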