Spaces: Runtime error

Update app.py

app.py CHANGED
@@ -5,17 +5,40 @@ from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
 from langchain.vectorstores import FAISS
 from huggingface_hub import InferenceClient
 import os
+import logging
+import traceback
+from datetime import datetime
+
+# Configure logging
+logging.basicConfig(
+    level=logging.DEBUG,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler(f'rag_app_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
 
 class RAGApplication:
     def __init__(self, hf_api_key):
+        try:
+            self.hf_api_key = hf_api_key
+            self.vector_store = None
+            logger.info("Initializing HuggingFace embeddings...")
+            self.embeddings = HuggingFaceInferenceAPIEmbeddings(
+                api_key=hf_api_key,
+                model_name="sentence-transformers/all-MiniLM-L6-v2"
+            )
+            logger.info("Initializing HuggingFace client...")
+            self.client = InferenceClient(api_key=hf_api_key)
+            self.conversation_history = []
+            logger.info("RAGApplication initialized successfully")
+        except Exception as e:
+            logger.error(f"Error initializing RAGApplication: {str(e)}")
+            logger.error(f"Traceback: {traceback.format_exc()}")
+            raise
+
         self.system_prompt = """You are a precise and accurate PDF summarization assistant. Your role is to:
 1. Provide accurate answers based solely on the provided context
 2. Maintain factual consistency and never hallucinate information

@@ -40,61 +63,88 @@ Answer:"""
 
     def process_pdf(self, file_path):
         try:
+            logger.info(f"Starting PDF processing for file: {file_path}")
+
             if file_path is None:
+                logger.warning("No file provided")
                 return "Please upload a PDF file."
 
             if not os.path.exists(file_path):
+                logger.error(f"File not found at path: {file_path}")
                 return f"File not found: {file_path}"
 
             # Reset conversation history when new PDF is loaded
             self.conversation_history = []
+            logger.info("Conversation history reset")
 
             # Read PDF directly from the file path
+            logger.info("Reading PDF file...")
             pdf_reader = PdfReader(file_path)
             text = ""
-            for page in pdf_reader.pages:
+            for i, page in enumerate(pdf_reader.pages):
+                try:
+                    text += page.extract_text()
+                    logger.debug(f"Extracted text from page {i+1}")
+                except Exception as e:
+                    logger.error(f"Error extracting text from page {i+1}: {str(e)}")
 
             if not text.strip():
+                logger.warning("No text extracted from PDF")
                 return "No text could be extracted from the PDF. Please make sure it's not empty or scanned."
 
             # Split text into chunks
+            logger.info("Splitting text into chunks...")
             text_splitter = RecursiveCharacterTextSplitter(
                 chunk_size=1000,
                 chunk_overlap=200,
                 length_function=len
             )
             chunks = text_splitter.split_text(text)
+            logger.info(f"Created {len(chunks)} chunks")
 
             if not chunks:
+                logger.warning("No chunks created from text")
                 return "No chunks were created. The PDF might be empty."
 
             # Create vector store
+            logger.info("Creating vector store...")
             self.vector_store = FAISS.from_texts(chunks, self.embeddings)
+            logger.info("Vector store created successfully")
 
             return "PDF processed successfully! You can now ask questions about it."
         except Exception as e:
+            error_msg = f"Error processing PDF: {str(e)}"
+            logger.error(error_msg)
+            logger.error(f"Traceback: {traceback.format_exc()}")
+            return error_msg
 
     def generate_response(self, message, history):
        try:
+            logger.info(f"Generating response for message: {message}")
+
             if self.vector_store is None:
+                logger.warning("No vector store available - PDF not processed")
                 return "Please upload and process a PDF first."
 
             query = message.strip()
             if not query:
+                logger.warning("Empty query received")
                 return "Please enter a question."
 
             # Search for relevant chunks
+            logger.info("Searching for relevant chunks...")
             relevant_chunks = self.vector_store.similarity_search(query, k=3)
             context = "\n\n".join([doc.page_content for doc in relevant_chunks])
+            logger.debug(f"Found {len(relevant_chunks)} relevant chunks")
 
             # Format conversation history
+            logger.debug(f"Processing conversation history (length: {len(history)})")
             conversation_history = "\n".join([
-                f"Q: {q}\nA: {a}" for q, a in history[-3:] if q and a
+                f"Q: {q}\nA: {a}" for q, a in history[-3:] if q and a
             ])
 
             # Create prompt with system prompt, context, and conversation history
+            logger.debug("Creating prompt...")
             prompt = self.system_prompt.format(
                 context=context,
                 conversation_history=conversation_history,

@@ -102,59 +152,83 @@
             )
 
             # Generate response using Mistral
+            logger.info("Generating response using Mistral...")
             response = ""
+            try:
+                for message in self.client.chat_completion(
+                    model="mistralai/Mistral-Nemo-Instruct-2407",
+                    messages=[
+                        {"role": "system", "content": prompt},
+                        {"role": "user", "content": query}
+                    ],
+                    max_tokens=500,
+                    stream=True,
+                ):
+                    response += message.choices[0].delta.content
+                logger.info("Response generated successfully")
+            except Exception as e:
+                logger.error(f"Error in chat completion: {str(e)}")
+                raise
 
             return response
         except Exception as e:
+            error_msg = f"Error generating response: {str(e)}"
+            logger.error(error_msg)
+            logger.error(f"Traceback: {traceback.format_exc()}")
+            return error_msg
 
 # Create Gradio interface
 def create_gradio_interface():
+    try:
+        logger.info("Creating Gradio interface...")
+        # You should never hardcode API keys - use environment variables in production
+        api_key = os.getenv("HUGGINGFACE_API_KEY", "your_huggingface_api_key")
+        rag = RAGApplication(hf_api_key=api_key)
 
-    with gr.Blocks() as demo:
-        process_button = gr.Button("Process PDF")
-        status_output = gr.Textbox(label="Status", interactive=False)
-        chat_interface = gr.ChatInterface(
-            fn=rag.generate_response,
-            title="Chat with your PDF",
-            description="Upload a PDF and ask questions about its contents.",
-            theme="soft",
-            examples=[
-                "What is the main topic of this document?",
-                "Can you summarize the key points?",
-                "What are the main conclusions?",
-            ],
-        )
-
-        return demo
+        with gr.Blocks() as demo:
+            gr.Markdown("# PDF Question Answering System")
+
+            with gr.Row():
+                pdf_input = gr.File(
+                    label="Upload PDF",
+                    file_types=[".pdf"],
+                    type="filepath"
+                )
+                process_button = gr.Button("Process PDF")
+                status_output = gr.Textbox(label="Status", interactive=False)
+
+            process_button.click(
+                fn=rag.process_pdf,
+                inputs=[pdf_input],
+                outputs=[status_output]
+            )
+
+            chat_interface = gr.ChatInterface(
+                fn=rag.generate_response,
+                title="Chat with your PDF",
+                description="Upload a PDF and ask questions about its contents.",
+                theme="soft",
+                examples=[
+                    "What is the main topic of this document?",
+                    "Can you summarize the key points?",
+                    "What are the main conclusions?",
+                ],
+            )
 
+        logger.info("Gradio interface created successfully")
+        return demo
+    except Exception as e:
+        logger.error(f"Error creating Gradio interface: {str(e)}")
+        logger.error(f"Traceback: {traceback.format_exc()}")
+        raise
 
 if __name__ == "__main__":
+    try:
+        logger.info("Starting application...")
+        demo = create_gradio_interface()
+        logger.info("Launching Gradio interface...")
+        demo.launch()
+    except Exception as e:
+        logger.error(f"Application failed to start: {str(e)}")
+        logger.error(f"Traceback: {traceback.format_exc()}")
+        raise
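One detail in the new process_pdf loop is worth flagging: text += page.extract_text() assumes a string comes back, while pages without extractable text can yield "" and, in some PyPDF2/pypdf versions, None; a None would simply be caught and logged by the new per-page except. A standalone helper that coalesces both cases instead of relying on the exception path (a hedged sketch; extract_pdf_text is not a name from this commit):

    from PyPDF2 import PdfReader

    def extract_pdf_text(path: str) -> str:
        """Concatenate text from every page, treating unreadable pages as empty."""
        reader = PdfReader(path)
        # extract_text() may return "" for scanned pages, or None in some versions
        return "\n".join((page.extract_text() or "") for page in reader.pages)

Coalescing keeps partially extractable documents usable without emitting one error per bad page.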
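The new streaming loop in generate_response is a plausible source of this Space's runtime error: with huggingface_hub's InferenceClient, a streamed chunk's choices[0].delta.content can be None (on role-only or final chunks, for example), and response += None raises a TypeError. A minimal hardened variant, assuming the same client, model, and prompt as in the diff (stream_answer is a hypothetical helper name):

    def stream_answer(client, prompt, query):
        # Sketch only, not part of the commit: the same call as in the diff,
        # but empty deltas are coalesced to "" so a None chunk cannot abort the loop.
        response = ""
        for chunk in client.chat_completion(
            model="mistralai/Mistral-Nemo-Instruct-2407",
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": query},
            ],
            max_tokens=500,
            stream=True,
        ):
            delta = chunk.choices[0].delta.content
            response += delta or ""
        return response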
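Finally, the fallback in api_key = os.getenv("HUGGINGFACE_API_KEY", "your_huggingface_api_key") means a missing Space secret silently becomes a placeholder token, and every embedding or chat call then fails at request time. Failing fast at startup is one alternative (a sketch; only the HUGGINGFACE_API_KEY name comes from the diff):

    import os

    api_key = os.environ.get("HUGGINGFACE_API_KEY")
    if not api_key:
        # Fail fast with an actionable message instead of passing a dummy token on
        raise RuntimeError("HUGGINGFACE_API_KEY is not set; add it as a Space secret.")

On Spaces, the token belongs in the repository's secrets settings rather than in code.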