Spaces:

TKM03
/

PDF_based_chatbot

Sleeping

App Files Files Community

TKM03 committed on Feb 18, 2025

Commit

a22d089

verified ·

1 Parent(s): 3c45742

done

Browse files

Files changed (1) hide show

app.py +108 -1

app.py CHANGED Viewed

@@ -4,7 +4,114 @@ import faiss
 import re
 import gradio as gr
-# [Previous functions remain exactly the same - preprocess_text, query_qa_system, initialize_qa_system, etc.]
 # Custom CSS for professional styling
 custom_css = """

 import re
 import gradio as gr
+import numpy as np
+from sentence_transformers import SentenceTransformer
+import faiss
+import re
def preprocess_text(text):
    """
    Split raw document text into cleaned, question-anchored sections.

    A new section starts at every line that, once stripped, begins with
    ``'Question'``. Every following non-blank line is appended to that
    section, and the lines of a section are joined with single spaces.
    Non-blank lines that appear before the first ``'Question'`` line form
    a section of their own.

    Each section is then cleaned:
      * a trailing run of digits (e.g. a page number) is removed,
      * the labels ``TRAPS:``, ``BEST ANSWER:`` and ``PASSABLE ANSWER:``
        are removed,
      * runs of whitespace are collapsed to a single space.

    Sections that are empty after cleaning are dropped so that callers
    never receive blank chunks.

    Args:
        text: The document text, with lines separated by ``'\\n'``.

    Returns:
        A list of cleaned section strings in document order; empty when
        no usable text is found.
    """
    # Group lines into sections, opening a new one at each 'Question' line.
    sections = []
    current_section = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('Question') and current_section:
            sections.append(' '.join(current_section))
            current_section = []
        current_section.append(line)
    if current_section:
        sections.append(' '.join(current_section))

    structured_sections = []
    for section in sections:
        # Remove page numbers and other irrelevant text
        section = re.sub(r'\d+\s*$', '', section)
        section = re.sub(r'TRAPS:|BEST ANSWER:|PASSABLE ANSWER:', ' ', section)
        # Removing a label leaves doubled spaces behind; normalise them.
        section = ' '.join(section.split())
        if section:
            structured_sections.append(section)
    return structured_sections
def create_qa_system(pdf_text, model_name="all-MiniLM-L6-v2"):
    """
    Build the retrieval components for answering questions about a document.

    The text is split into sections with ``preprocess_text``, each section
    is embedded with a ``SentenceTransformer`` model, and the L2-normalised
    embeddings are stored in a FAISS inner-product index, so that search
    scores are cosine similarities.

    Args:
        pdf_text: Raw text of the document.
        model_name: Name of the sentence-transformers model to load.

    Returns:
        A ``(model, index, text_chunks)`` tuple, where row ``i`` of the
        index corresponds to ``text_chunks[i]``.

    Raises:
        ValueError: If no sections could be extracted from ``pdf_text``.
    """
    # Process text into structured sections
    text_chunks = preprocess_text(pdf_text)
    if not text_chunks:
        # Fail early with a clear message instead of an IndexError on
        # ``embeddings.shape[1]`` after the model has already been loaded.
        raise ValueError(
            "No text sections could be extracted from the document; "
            "cannot build the QA index."
        )

    # Create embeddings
    model = SentenceTransformer(model_name)
    # FAISS works on contiguous float32 matrices and normalises in place.
    embeddings = np.ascontiguousarray(model.encode(text_chunks), dtype=np.float32)

    # Normalize vectors so the inner product equals cosine similarity
    faiss.normalize_L2(embeddings)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity
    index.add(embeddings)
    return model, index, text_chunks
def query_qa_system(question, model, index, text_chunks, similarity_threshold=0.3):
    """
    Find the document section that best matches a question.

    The question is embedded with ``model``, L2-normalised, and searched
    against ``index`` for its single nearest neighbour. The hit is accepted
    only if its score is at least ``similarity_threshold``.

    Args:
        question: The user's question text.
        model: Embedding model exposing ``encode(list_of_str)``.
        index: FAISS index whose rows line up with ``text_chunks``.
        text_chunks: Section texts, in the order they were indexed.
        similarity_threshold: Minimum score required to accept a match.

    Returns:
        A dict with keys ``'question'`` (the ``'Question N:'`` label of the
        match, ``"Matching section"`` if the match has no such label, or
        ``None`` when nothing matched), ``'full_text'``, ``'confidence'``
        (float score) and ``'found_answer'`` (bool).
    """
    # Encode and normalize the question (FAISS normalises float32 in place)
    question_embedding = np.ascontiguousarray(
        model.encode([question]), dtype=np.float32
    )
    faiss.normalize_L2(question_embedding)

    # Search for the single most similar chunk
    k = 1
    similarities, indices = index.search(question_embedding, k)
    best_idx = int(indices[0][0])
    similarity_score = float(similarities[0][0])  # Cosine similarity score

    # FAISS reports a label of -1 when it has no result (e.g. an empty
    # index). Without this guard, -1 would select the LAST chunk.
    has_hit = best_idx >= 0
    if not has_hit:
        similarity_score = 0.0

    if has_hit and similarity_score >= similarity_threshold:
        matched_text = text_chunks[best_idx]
        # Extract just the question number for reference
        label_match = re.search(r'Question \d+:', matched_text)
        question_num = label_match.group(0) if label_match else "Matching section"
        return {
            'question': question_num,
            'full_text': matched_text,
            'confidence': similarity_score,
            'found_answer': True
        }

    return {
        'question': None,
        'full_text': "I couldn't find a sufficiently relevant answer to your question in the provided document.",
        'confidence': similarity_score,
        'found_answer': False
    }
def ask_question(question, model, index, text_chunks):
    """
    Ask a question, print the outcome to stdout, and return the result.

    Delegates the lookup to ``query_qa_system`` and prints the question,
    a separator line, and either the matched section with its confidence
    or the not-found message with the best confidence seen.

    Returns:
        The result dict produced by ``query_qa_system``, unchanged.
    """
    result = query_qa_system(question, model, index, text_chunks)

    # Header: echo the question followed by a rule.
    print("\nQ:", question)
    print("-" * 50)

    # Guard clause: report the miss and leave early.
    if not result['found_answer']:
        print(result['full_text'])
        print(f"Best match confidence: {result['confidence']:.2f}")
        return result

    print(f"Found matching section (confidence: {result['confidence']:.2f}):")
    print(f"\n{result['full_text']}\n")
    return result
+# Initialize the system once at import time (loads the model and builds the index); NOTE(review): `pdf_text` is not defined in the visible diff - presumably set earlier in app.py, TODO confirm
+model, index, text_chunks = create_qa_system(pdf_text)
 # Custom CSS for professional styling
 custom_css = """