# Source: Hugging Face Space "heerjtdev/answer" (status: Sleeping)
# File size: 10,828 Bytes



import gradio as gr
import fitz  # PyMuPDF
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

class VectorSystem:
    """Index one uploaded document and retrieve passages with their neighbours.

    The document is split into ordered chunks. Each chunk is embedded into a
    FAISS vector store and tagged with its position (``{"id": i}``), and the
    ordered chunk list is kept alongside the store so that the chunk before
    and after any match can be looked up by index.
    """

    # Tuning knobs, kept in one place instead of being scattered as literals.
    EMBEDDING_MODEL = "all-MiniLM-L6-v2"
    CHUNK_SIZE = 800
    CHUNK_OVERLAP = 150
    SEPARATORS = ("\n\n", "\n", ".", " ", "")
    TOP_K = 3

    def __init__(self):
        """Create the embedding model; no document is indexed yet."""
        self.vector_store = None
        self.embeddings = HuggingFaceEmbeddings(model_name=self.EMBEDDING_MODEL)
        # Ordered copy of every chunk so neighbours can be accessed by index.
        self.all_chunks = []

    @staticmethod
    def _extract_text(file_path):
        """Return the text of a .pdf or .txt file, or None for any other type."""
        lowered = file_path.lower()
        if lowered.endswith(".pdf"):
            # The context manager closes the document even if a page fails.
            with fitz.open(file_path) as doc:
                return "".join(page.get_text() for page in doc)
        if lowered.endswith(".txt"):
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        return None

    def process_file(self, file_obj):
        """Extract text, split it into ordered chunks and build the vector index.

        Args:
            file_obj: The uploaded file. Either an object exposing the path as
                ``.name`` or the path itself as a string. ``None`` means
                nothing was uploaded.

        Returns:
            A human-readable status message (success or the reason for failure).
            Errors are reported through the message rather than raised, so the
            UI callback never crashes.
        """
        if file_obj is None:
            return "No file uploaded."

        try:
            # Accept both file-like uploads (.name) and plain path strings.
            file_path = str(getattr(file_obj, "name", file_obj))

            # 1. Extract Text
            text = self._extract_text(file_path)
            if text is None:
                return "❌ Error: Only .pdf and .txt files are supported."

            # 2. Split Text
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.CHUNK_SIZE,
                chunk_overlap=self.CHUNK_OVERLAP,
                separators=list(self.SEPARATORS),
            )
            chunks = text_splitter.split_text(text)

            if not chunks:
                return "Could not extract text. Is the file empty?"

            # 3. Build Vector Index with ID Metadata
            # Every vector carries its chunk position (0, 1, 2...) so a match
            # can be mapped back into the ordered chunk list.
            metadatas = [{"id": i} for i in range(len(chunks))]
            vector_store = FAISS.from_texts(
                chunks,
                self.embeddings,
                metadatas=metadatas,
            )
        except Exception as e:
            return f"Error processing file: {str(e)}"

        # Commit both pieces of state together, and only on success, so the
        # chunk list and the index can never describe different documents.
        self.all_chunks = chunks
        self.vector_store = vector_store
        return f"✅ Success! Indexed {len(chunks)} chunks."

    def retrieve_evidence(self, question, student_answer):
        """Return a Markdown report of the best matches plus adjacent chunks.

        Args:
            question: Query text used for the similarity search.
            student_answer: Accepted for interface compatibility; not used by
                the retrieval logic.

        Returns:
            A Markdown string with up to ``TOP_K`` matches, each shown with the
            chunk before and after it, or a warning message when no document
            is indexed or the question is blank.
        """
        if self.vector_store is None:
            return "⚠️ Please upload and process a file first."

        # Treat whitespace-only input the same as an empty question.
        if not question or not question.strip():
            return "⚠️ Please enter a Question."

        # Lower Score = Better Match
        results = self.vector_store.similarity_search_with_score(
            question, k=self.TOP_K
        )

        last_index = len(self.all_chunks) - 1
        sections = ["### 🔍 Expanded Context Analysis:\n"]

        for rank, (doc, score) in enumerate(results, start=1):
            chunk_id = doc.metadata["id"]

            # The first and last chunks have no neighbour on one side; show a
            # placeholder marker there instead.
            prev_chunk = (
                self.all_chunks[chunk_id - 1] if chunk_id > 0 else "(Start of Text)"
            )
            next_chunk = (
                self.all_chunks[chunk_id + 1]
                if chunk_id < last_index
                else "(End of Text)"
            )

            # Neighbouring chunks are shown in full, not truncated.
            sections.append(
                f"\n#### 🎯 Match #{rank} (Distance Score: {score:.4f})\n"
                f"> **Preceding Context:**\n{prev_chunk}\n\n"
                f"> **MATCH:**\n**{doc.page_content}**\n\n"
                f"> **Succeeding Context:**\n{next_chunk}\n"
                "---\n"
            )

        return "".join(sections)

# Initialize System
# One module-level VectorSystem instance is shared by both button callbacks
# below, so the index built by "Process File" is the one "Find Context" searches.
system = VectorSystem()

# --- Gradio UI ---
# Components are created inside nested Row/Column contexts; the order in which
# they are created here determines the layout.
with gr.Blocks(title="EduGenius Context Retriever") as demo:
    gr.Markdown("# 🎓 EduGenius: Smart Context Retriever")
    gr.Markdown("Upload a Chapter. This version finds the best match AND shows you the text immediately before and after it.")

    with gr.Row():
        # Left column (narrower): file upload, indexing trigger, status line.
        with gr.Column(scale=1):
            pdf_input = gr.File(label="1. Upload File (PDF or TXT)", file_types=[".pdf", ".txt"])
            upload_btn = gr.Button("Process File", variant="primary")
            upload_status = gr.Textbox(label="Status", interactive=False)

        # Right column (wider): question form and the rendered Markdown results.
        with gr.Column(scale=2):
            question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
            answer_input = gr.Textbox(label="Student Answer (Optional)", placeholder="e.g., The heat causes it...")
            search_btn = gr.Button("Find Context + Neighbors", variant="secondary")
            
            evidence_output = gr.Markdown(label="Relevant Text Chunks")

    # Event wiring: process_file's returned status string goes to the status box;
    # retrieve_evidence receives (question, student answer) and its returned
    # Markdown goes to the results pane.
    upload_btn.click(fn=system.process_file, inputs=[pdf_input], outputs=[upload_status])
    search_btn.click(fn=system.retrieve_evidence, inputs=[question_input, answer_input], outputs=[evidence_output])

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()









# import gradio as gr
# import fitz  # PyMuPDF
# import numpy as np
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain_community.vectorstores import FAISS
# from langchain_huggingface import HuggingFaceEmbeddings

# class VectorSystem:
#     def __init__(self):
#         self.vector_store = None
#         self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
#         self.all_chunks = []

#     def process_file(self, file_obj):
#         """Extracts text, preserves order, and builds the Vector Index"""
#         if file_obj is None:
#             return "No file uploaded."

#         try:
#             # 1. Extract Text
#             text = ""
#             file_path = file_obj.name
            
#             if file_path.lower().endswith('.pdf'):
#                 doc = fitz.open(file_path)
#                 for page in doc: text += page.get_text()
#             elif file_path.lower().endswith('.txt'):
#                 with open(file_path, 'r', encoding='utf-8') as f: text = f.read()
#             else:
#                 return "❌ Error: Only .pdf and .txt files are supported."

#             # 2. Split Text
#             text_splitter = RecursiveCharacterTextSplitter(
#                 chunk_size=800,
#                 chunk_overlap=150,
#                 separators=["\n\n", "\n", ".", " ", ""]
#             )
#             self.all_chunks = text_splitter.split_text(text)

#             if not self.all_chunks:
#                 return "Could not extract text. Is the file empty?"

#             # 3. Build Vector Index with ID Metadata
#             metadatas = [{"id": i} for i in range(len(self.all_chunks))]
            
#             self.vector_store = FAISS.from_texts(
#                 self.all_chunks, 
#                 self.embeddings, 
#                 metadatas=metadatas
#             )
            
#             return f"✅ Success! Indexed {len(self.all_chunks)} chunks."
        
#         except Exception as e:
#             return f"Error processing file: {str(e)}"

#     def retrieve_evidence(self, question, student_answer):
#         if not self.vector_store:
#             return "⚠️ Please upload and process a file first."
#         if not question:
#             return "⚠️ Please enter a Question."

#         # 1. Get Initial Results (Core Matches)
#         # FAISS returns L2 distance (Lower is better)
#         results = self.vector_store.similarity_search_with_score(question, k=3)
        
#         # We need the vector for the QUESTION to do our own math later
#         q_vector = np.array(self.embeddings.embed_query(question))
        
#         output_text = "### 🔍 Smart Context Analysis:\n"
        
#         for i, (doc, core_score) in enumerate(results):
#             chunk_id = doc.metadata['id']
            
#             # 2. Identify Neighbors
#             prev_chunk = self.all_chunks[chunk_id - 1] if chunk_id > 0 else ""
#             next_chunk = self.all_chunks[chunk_id + 1] if chunk_id < len(self.all_chunks) - 1 else ""
            
#             # 3. Create the "Super Chunk" (Prev + Core + Next)
#             super_chunk_text = f"{prev_chunk} {doc.page_content} {next_chunk}"
            
#             # 4. Calculate "Super Score" (Re-embedding on the fly)
#             # We embed the Super Chunk and measure distance to Question
#             super_vector = np.array(self.embeddings.embed_query(super_chunk_text))
#             super_score = np.linalg.norm(q_vector - super_vector) # Euclidean Distance
            
#             output_text += f"\n#### 🎯 Match #{i+1}\n"
            
#             # 5. The Logic Test: Does Context Improve the Score?
#             # Remember: LOWER score is BETTER (closer distance)
            
#             if super_score < core_score:
#                 # CASE A: Context Helps! (Distance Reduced)
#                 output_text += f"**✅ Context Added:** The surrounding text made the match stronger (Score improved from {core_score:.3f} to {super_score:.3f}).\n\n"
#                 output_text += f"> {prev_chunk} **{doc.page_content}** {next_chunk}\n"
#             else:
#                 # CASE B: Context Dilutes! (Distance Increased or Same)
#                 output_text += f"**⏹️ Context Ignored:** Surrounding text was irrelevant or noisy (Score worsened from {core_score:.3f} to {super_score:.3f}). Showing Core Match only.\n\n"
#                 output_text += f"> **{doc.page_content}**\n"
            
#             output_text += "---\n"

#         return output_text

# # Initialize System
# system = VectorSystem()

# # --- Gradio UI ---
# with gr.Blocks(title="EduGenius Context Retriever") as demo:
#     gr.Markdown("# 🎓 EduGenius: Intelligent Context Retriever")
#     gr.Markdown("Upload a Chapter. This system intelligently decides if it needs to read the surrounding paragraphs to answer your question.")

#     with gr.Row():
#         with gr.Column(scale=1):
#             pdf_input = gr.File(label="1. Upload File (PDF or TXT)", file_types=[".pdf", ".txt"])
#             upload_btn = gr.Button("Process File", variant="primary")
#             upload_status = gr.Textbox(label="Status", interactive=False)

#         with gr.Column(scale=2):
#             question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
#             answer_input = gr.Textbox(label="Student Answer (Optional)", placeholder="e.g., The heat causes it...")
#             search_btn = gr.Button("Find Evidence", variant="secondary")
            
#             evidence_output = gr.Markdown(label="Relevant Text Chunks")

#     upload_btn.click(fn=system.process_file, inputs=[pdf_input], outputs=[upload_status])
#     search_btn.click(fn=system.retrieve_evidence, inputs=[question_input, answer_input], outputs=[evidence_output])

# if __name__ == "__main__":
#     demo.launch()