Spaces:

Prashanthsrn
/

RAG_POC

Build error

App Files Files Community

Prashanthsrn committed on Oct 11, 2024

Commit

ae03db9

verified ·

1 Parent(s): 912f1b8

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -27

app.py CHANGED Viewed

@@ -4,6 +4,12 @@ import numpy as np
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
 from sklearn.metrics.pairwise import cosine_similarity
 # Global variables to store models and processed data
 model = None
@@ -13,38 +19,88 @@ embeddings = None
 def load_models():
     global model, generator
-    if model is None or generator is None:
         model = SentenceTransformer('all-MiniLM-L6-v2')
         generator = pipeline('text-generation', model='facebook/bart-large-cnn')
-    return "Models loaded successfully!"
 def extract_text_from_pdf(file):
-    global chunks, embeddings
     if file is None:
-        return "Please upload a PDF file."
     try:
         pdf_reader = PyPDF2.PdfReader(file)
         full_text = ""
         for page in pdf_reader.pages:
-            full_text += page.extract_text()
         # Split text into chunks
-        chunks = [full_text[i:i + 512] for i in range(0, len(full_text), 512)]
         # Generate embeddings
         embeddings = model.encode(chunks)
-        return f"PDF processed successfully! Extracted {len(chunks)} text chunks."
     except Exception as e:
-        return f"Error processing PDF: {str(e)}"
 def answer_question(question):
     if not chunks or embeddings is None:
-        return "Please upload a PDF document first."
     if not question:
-        return "Please enter a question."
     try:
         # Embed the question
@@ -56,36 +112,47 @@ def answer_question(question):
         context = chunks[most_similar_idx]
         # Generate answer
-        prompt = f"Question: {question}\nContext: {context}"
         response = generator(prompt, max_length=100, num_return_sequences=1)[0]['generated_text']
         return response
     except Exception as e:
-        return f"Error generating answer: {str(e)}"
 # Create the Gradio interface
-with gr.Blocks() as demo:
-    gr.Markdown("# RAG Chatbot using Sentence-BERT and BART")
     with gr.Row():
-        with gr.Column():
-            load_button = gr.Button("Load Models")
-            model_status = gr.Textbox(label="Model Status")
-            load_button.click(load_models, outputs=model_status)
     with gr.Row():
-        with gr.Column():
-            pdf_input = gr.File(label="Upload PDF")
-            pdf_status = gr.Textbox(label="PDF Status")
-            pdf_input.change(extract_text_from_pdf, inputs=pdf_input, outputs=pdf_status)
     with gr.Row():
-        with gr.Column():
-            question_input = gr.Textbox(label="Ask a question about the PDF")
-            answer_output = gr.Textbox(label="Answer")
-            question_button = gr.Button("Get Answer")
-    question_button.click(answer_question, inputs=question_input, outputs=answer_output)
 # Launch the app
 demo.launch()

 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
 from sklearn.metrics.pairwise import cosine_similarity
+import logging
+import re
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 # Global variables to store models and processed data
 model = None
 def load_models():
     """Load the sentence-embedding model and the text-generation pipeline.

     The loaded objects are stored in the module-level ``model`` and
     ``generator`` globals. A component that is already loaded is reused, so
     clicking the button again does not re-instantiate it.

     Returns:
         A human-readable status string: a success message, or an error
         message containing the text of the exception that was raised.
     """
     global model, generator
     try:
         # Load each component only when it is still missing; a previous
         # partial failure is retried without reloading what already works.
         if model is None:
             model = SentenceTransformer('all-MiniLM-L6-v2')
         if generator is None:
             # NOTE(review): 'facebook/bart-large-cnn' is published as a
             # summarization checkpoint; confirm that 'text-generation' is
             # the intended pipeline task for it.
             generator = pipeline('text-generation', model='facebook/bart-large-cnn')
         return "✅ Models loaded successfully!"
     except Exception as e:
         # logger.exception records the traceback along with the message.
         logger.exception("Error loading models")
         return f"❌ Error loading models: {str(e)}"
def clean_text(text):
    """Normalize text extracted from a PDF page.

    Characters other than letters, digits, underscores, whitespace and a
    small set of common punctuation marks are deleted, then every run of
    whitespace is collapsed into a single space.

    Sentence-ending punctuation (``.``, ``!``, ``?``) is deliberately kept:
    the sentence splitter used for chunking relies on it to find sentence
    boundaries.

    Args:
        text: Raw text; ``None`` or an empty string is accepted.

    Returns:
        The cleaned text with no leading or trailing whitespace, or an empty
        string when there is nothing left.
    """
    if not text:
        return ""
    # Drop unwanted symbols first so that the whitespace pass below also
    # merges the gaps they leave behind.
    text = re.sub(r"""[^\w\s.,!?;:'"()%/-]""", '', text)
    # Collapse newlines, tabs and repeated spaces into single spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def split_text(text, chunk_size=512):
    """Split text into sentence-aligned chunks of bounded length.

    Sentences are detected with a crude rule (whitespace that follows ``.``,
    ``!`` or ``?``) and packed greedily: a sentence joins the current chunk
    only while the combined length stays below ``chunk_size``. A sentence
    that is itself longer than ``chunk_size`` is broken on word boundaries,
    and a single word longer than ``chunk_size`` is sliced, so no chunk ever
    exceeds ``chunk_size`` characters.

    Args:
        text: The text to split; ``None`` or blank text yields no chunks.
        chunk_size: Maximum chunk length in characters. Must be positive.

    Returns:
        A list of non-empty chunks without leading or trailing whitespace.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    stripped = (text or "").strip()
    if not stripped:
        return []

    chunks = []
    current_chunk = ""
    # Split text into sentences (crude approximation)
    for sentence in re.split(r'(?<=[.!?])\s+', stripped):
        for piece in _split_oversized_sentence(sentence, chunk_size):
            if not current_chunk:
                current_chunk = piece
            elif len(current_chunk) + 1 + len(piece) < chunk_size:
                # The "+ 1" accounts for the joining space.
                current_chunk = f"{current_chunk} {piece}"
            else:
                chunks.append(current_chunk)
                current_chunk = piece
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


def _split_oversized_sentence(sentence, chunk_size):
    """Yield non-empty pieces of *sentence*, each at most *chunk_size* long."""
    if not sentence:
        return
    if len(sentence) <= chunk_size:
        yield sentence
        return

    current = ""
    for word in sentence.split():
        # A word that cannot fit in any chunk is sliced into fixed-size parts.
        while len(word) > chunk_size:
            if current:
                yield current
                current = ""
            yield word[:chunk_size]
            word = word[chunk_size:]
        if not word:
            continue
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= chunk_size:
            current = candidate
        else:
            yield current
            current = word
    if current:
        yield current
 def extract_text_from_pdf(file):
     """Read a PDF, chunk its text and embed the chunks for later retrieval.

     Each page's text is cleaned with ``clean_text``, the pages are joined
     with spaces, the result is split with ``split_text`` and the chunks are
     encoded with the module-level ``model``. On success the module-level
     ``chunks`` and ``embeddings`` are replaced together; on any failure
     they keep their previous values.

     Args:
         file: The uploaded PDF, in whatever form ``PyPDF2.PdfReader``
             accepts (presumably the value supplied by the upload widget;
             verify against the caller).

     Returns:
         A human-readable status string describing success or the reason
         for failure.
     """
     global chunks, embeddings
     if model is None:
         return "❌ Please load the models first."
     if file is None:
         return "❌ Please upload a PDF file."
     try:
         pdf_reader = PyPDF2.PdfReader(file)
         page_texts = []
         for page in pdf_reader.pages:
             text = page.extract_text()
             if text:
                 cleaned_text = clean_text(text)
                 if cleaned_text:
                     page_texts.append(cleaned_text)
         full_text = " ".join(page_texts)
         if not full_text.strip():
             return "❌ No readable text found in the PDF. The file might be scanned or contain only images."
         # Split text into chunks
         new_chunks = split_text(full_text)
         if not new_chunks:
             return "❌ Could not create meaningful text chunks from the PDF."
         # Generate embeddings
         new_embeddings = model.encode(new_chunks)
         # Publish both only after encoding succeeded, so the stored chunks
         # and embeddings always describe the same document.
         chunks, embeddings = new_chunks, new_embeddings
         return f"✅ PDF processed successfully! Extracted {len(chunks)} text chunks."
     except Exception as e:
         # logger.exception records the traceback along with the message.
         logger.exception("Error processing PDF")
         return f"❌ Error processing PDF: {str(e)}"
 def answer_question(question):
+    global model, generator, chunks, embeddings
+    if model is None or generator is None:
+        return "❌ Please load the models first."
     if not chunks or embeddings is None:
+        return "❌ Please upload and process a PDF document first."
     if not question:
+        return "❌ Please enter a question."
     try:
         # Embed the question
         context = chunks[most_similar_idx]
         # Generate answer
+        prompt = f"Question: {question}\nContext: {context}\nAnswer:"
         response = generator(prompt, max_length=100, num_return_sequences=1)[0]['generated_text']
         return response
     except Exception as e:
+        logger.error(f"Error generating answer: {e}")
+        return f"❌ Error generating answer: {str(e)}"
 # Create the Gradio interface
# Usage notes rendered at the bottom of the page.
usage_notes = """
    ## How to use:
    1. Click 'Load Models' and wait for confirmation
    2. Upload a PDF document and wait for it to be processed
    3. Type your question and click 'Get Answer'
    Note: This tool works best with PDFs that contain readable text. It may not work well with scanned documents or PDFs that are primarily images.
    """

with gr.Blocks(title="PDF Q&A Bot") as demo:
    gr.Markdown("# PDF Question-Answering Bot")

    # Step 1: trigger model loading and show its status.
    gr.Markdown("### Step 1: Load the necessary models")
    with gr.Row():
        load_button = gr.Button("1️⃣ Load Models", variant="primary")
        model_status = gr.Textbox(label="Model Status", interactive=False)
    load_button.click(fn=load_models, outputs=model_status)

    # Step 2: upload a PDF; processing starts when the file changes.
    gr.Markdown("### Step 2: Upload a PDF document")
    with gr.Row():
        pdf_input = gr.File(label="2️⃣ Upload PDF")
        pdf_status = gr.Textbox(label="PDF Status", interactive=False)
    pdf_input.change(fn=extract_text_from_pdf, inputs=pdf_input, outputs=pdf_status)

    # Step 3: ask a question and display the generated answer.
    gr.Markdown("### Step 3: Ask questions about the document")
    with gr.Row():
        question_input = gr.Textbox(label="3️⃣ Ask a question about the PDF")
        answer_button = gr.Button("Get Answer", variant="primary")
        answer_output = gr.Textbox(label="Answer", interactive=False)
    answer_button.click(fn=answer_question, inputs=question_input, outputs=answer_output)

    gr.Markdown(usage_notes)

# Start serving the interface.
demo.launch()