Spaces:

heerjtdev
/

answer

Sleeping

App Files Files Community

heerjtdev committed 23 days ago

Commit

1049876

verified ·

1 Parent(s): d443dc8

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -67

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 # import gradio as gr
 # import fitz  # PyMuPDF
 # from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -10,22 +11,32 @@
 # class VectorSystem:
 #     def __init__(self):
 #         self.vector_store = None
-#         # Use a lightweight CPU-friendly model
 #         self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
-#     def process_pdf(self, file_obj):
-#         """Extracts text from PDF and builds the Vector Index"""
 #         if file_obj is None:
 #             return "No file uploaded."
 #         try:
-#             # 1. Extract Text
-#             doc = fitz.open(file_obj.name)
 #             text = ""
-#             for page in doc:
-#                 text += page.get_text()
-#             # 2. Split Text into Chunks
 #             text_splitter = RecursiveCharacterTextSplitter(
 #                 chunk_size=800,
 #                 chunk_overlap=150,
@@ -34,28 +45,25 @@
 #             chunks = text_splitter.split_text(text)
 #             if not chunks:
-#                 return "Could not extract text. Is the PDF scanned images?"
 #             # 3. Build Vector Index (FAISS)
 #             self.vector_store = FAISS.from_texts(chunks, self.embeddings)
-#             return f"✅ Success! Indexed {len(chunks)} text chunks from the PDF."
 #         except Exception as e:
-#             return f"Error processing PDF: {str(e)}"
 #     def retrieve_evidence(self, question, student_answer):
-#         """Finds relevant text chunks based on the Question"""
 #         if not self.vector_store:
-#             return "⚠️ Please upload and process a PDF first."
 #         if not question:
 #             return "⚠️ Please enter a Question."
-#         # We search primarily using the Question to find the 'Ground Truth' in the text.
 #         docs = self.vector_store.similarity_search(question, k=3)
-#         # Format the output
 #         output_text = "### 🔍 Relevant Context Found:\n\n"
 #         for i, doc in enumerate(docs):
 #             output_text += f"**Chunk {i+1}:**\n> {doc.page_content}\n\n"
@@ -69,28 +77,26 @@
 # # --- Gradio UI ---
 # with gr.Blocks(title="EduGenius Context Retriever") as demo:
-#     gr.Markdown("# 🎓 EduGenius: PDF Context Retriever")
-#     gr.Markdown("Upload a chapter, ask a question, and see exactly which part of the text proves the answer right or wrong.")
 #     with gr.Row():
 #         with gr.Column(scale=1):
-#             # Step 1: Upload
-#             pdf_input = gr.File(label="1. Upload PDF Chapter", file_types=[".pdf"])
-#             upload_btn = gr.Button("Process PDF", variant="primary")
 #             upload_status = gr.Textbox(label="Status", interactive=False)
 #         with gr.Column(scale=2):
-#             # Step 2: Query
 #             question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
 #             answer_input = gr.Textbox(label="Student Answer (Optional Context)", placeholder="e.g., The heat causes it...")
 #             search_btn = gr.Button("Find Relevant Evidence", variant="secondary")
-#             # Output
 #             evidence_output = gr.Markdown(label="Relevant Text Chunks")
 #     # Event Handlers
 #     upload_btn.click(
-#         fn=system.process_pdf,
 #         inputs=[pdf_input],
 #         outputs=[upload_status]
 #     )
@@ -101,67 +107,66 @@
 #         outputs=[evidence_output]
 #     )
-# # Launch
 # if __name__ == "__main__":
 #     demo.launch()
 import gradio as gr
 import fitz  # PyMuPDF
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain_huggingface import HuggingFaceEmbeddings
-import os
-# --- Backend Logic ---
 class VectorSystem:
     def __init__(self):
         self.vector_store = None
         self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
     def process_file(self, file_obj):
-        """Extracts text from PDF OR TXT and builds the Vector Index"""
         if file_obj is None:
             return "No file uploaded."
         try:
             text = ""
             file_path = file_obj.name
-            # --- LOGIC BRANCH: Detect File Type ---
             if file_path.lower().endswith('.pdf'):
-                # Handle PDF
                 doc = fitz.open(file_path)
-                for page in doc:
-                    text += page.get_text()
             elif file_path.lower().endswith('.txt'):
-                # Handle Text File
-                with open(file_path, 'r', encoding='utf-8') as f:
-                    text = f.read()
             else:
                 return "❌ Error: Only .pdf and .txt files are supported."
-            # --------------------------------------
-            # 2. Split Text into Chunks (Logic is identical for both)
             text_splitter = RecursiveCharacterTextSplitter(
                 chunk_size=800,
                 chunk_overlap=150,
                 separators=["\n\n", "\n", ".", " ", ""]
             )
-            chunks = text_splitter.split_text(text)
-            if not chunks:
                 return "Could not extract text. Is the file empty?"
-            # 3. Build Vector Index (FAISS)
-            self.vector_store = FAISS.from_texts(chunks, self.embeddings)
-            return f"✅ Success! Indexed {len(chunks)} text chunks."
         except Exception as e:
             return f"Error processing file: {str(e)}"
@@ -173,50 +178,59 @@ class VectorSystem:
         if not question:
             return "⚠️ Please enter a Question."
-        docs = self.vector_store.similarity_search(question, k=3)
-        output_text = "### 🔍 Relevant Context Found:\n\n"
-        for i, doc in enumerate(docs):
-            output_text += f"**Chunk {i+1}:**\n> {doc.page_content}\n\n"
-        output_text += "---\n*These are the most relevant segments to grade the answer against.*"
         return output_text
 # Initialize System
 system = VectorSystem()
 # --- Gradio UI ---
 with gr.Blocks(title="EduGenius Context Retriever") as demo:
-    gr.Markdown("# 🎓 EduGenius: Context Retriever")
-    gr.Markdown("Upload a Chapter (PDF or TXT), ask a question, and see exactly which part of the text proves the answer right or wrong.")
     with gr.Row():
         with gr.Column(scale=1):
-            # UPDATED: Added ".txt" to file_types and changed label
             pdf_input = gr.File(label="1. Upload File (PDF or TXT)", file_types=[".pdf", ".txt"])
             upload_btn = gr.Button("Process File", variant="primary")
             upload_status = gr.Textbox(label="Status", interactive=False)
         with gr.Column(scale=2):
             question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
-            answer_input = gr.Textbox(label="Student Answer (Optional Context)", placeholder="e.g., The heat causes it...")
-            search_btn = gr.Button("Find Relevant Evidence", variant="secondary")
             evidence_output = gr.Markdown(label="Relevant Text Chunks")
-    # Event Handlers
-    upload_btn.click(
-        fn=system.process_file,  # Note: Function name changed
-        inputs=[pdf_input],
-        outputs=[upload_status]
-    )
-    search_btn.click(
-        fn=system.retrieve_evidence,
-        inputs=[question_input, answer_input],
-        outputs=[evidence_output]
-    )
 if __name__ == "__main__":
     demo.launch()

 # import gradio as gr
 # import fitz  # PyMuPDF
 # from langchain_text_splitters import RecursiveCharacterTextSplitter
 # class VectorSystem:
 #     def __init__(self):
 #         self.vector_store = None
 #         self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+#     def process_file(self, file_obj):
+#         """Extracts text from PDF OR TXT and builds the Vector Index"""
 #         if file_obj is None:
 #             return "No file uploaded."
 #         try:
 #             text = ""
+#             file_path = file_obj.name
+#             # --- LOGIC BRANCH: Detect File Type ---
+#             if file_path.lower().endswith('.pdf'):
+#                 # Handle PDF
+#                 doc = fitz.open(file_path)
+#                 for page in doc:
+#                     text += page.get_text()
+#             elif file_path.lower().endswith('.txt'):
+#                 # Handle Text File
+#                 with open(file_path, 'r', encoding='utf-8') as f:
+#                     text = f.read()
+#             else:
+#                 return "❌ Error: Only .pdf and .txt files are supported."
+#             # --------------------------------------
+#             # 2. Split Text into Chunks (Logic is identical for both)
 #             text_splitter = RecursiveCharacterTextSplitter(
 #                 chunk_size=800,
 #                 chunk_overlap=150,
 #             chunks = text_splitter.split_text(text)
 #             if not chunks:
+#                 return "Could not extract text. Is the file empty?"
 #             # 3. Build Vector Index (FAISS)
 #             self.vector_store = FAISS.from_texts(chunks, self.embeddings)
+#             return f"✅ Success! Indexed {len(chunks)} text chunks."
 #         except Exception as e:
+#             return f"Error processing file: {str(e)}"
 #     def retrieve_evidence(self, question, student_answer):
 #         if not self.vector_store:
+#             return "⚠️ Please upload and process a file first."
 #         if not question:
 #             return "⚠️ Please enter a Question."
 #         docs = self.vector_store.similarity_search(question, k=3)
 #         output_text = "### 🔍 Relevant Context Found:\n\n"
 #         for i, doc in enumerate(docs):
 #             output_text += f"**Chunk {i+1}:**\n> {doc.page_content}\n\n"
 # # --- Gradio UI ---
 # with gr.Blocks(title="EduGenius Context Retriever") as demo:
+#     gr.Markdown("# 🎓 EduGenius: Context Retriever")
+#     gr.Markdown("Upload a Chapter (PDF or TXT), ask a question, and see exactly which part of the text proves the answer right or wrong.")
 #     with gr.Row():
 #         with gr.Column(scale=1):
+#             # UPDATED: Added ".txt" to file_types and changed label
+#             pdf_input = gr.File(label="1. Upload File (PDF or TXT)", file_types=[".pdf", ".txt"])
+#             upload_btn = gr.Button("Process File", variant="primary")
 #             upload_status = gr.Textbox(label="Status", interactive=False)
 #         with gr.Column(scale=2):
 #             question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
 #             answer_input = gr.Textbox(label="Student Answer (Optional Context)", placeholder="e.g., The heat causes it...")
 #             search_btn = gr.Button("Find Relevant Evidence", variant="secondary")
 #             evidence_output = gr.Markdown(label="Relevant Text Chunks")
 #     # Event Handlers
 #     upload_btn.click(
+#         fn=system.process_file,  # Note: Function name changed
 #         inputs=[pdf_input],
 #         outputs=[upload_status]
 #     )
 #         outputs=[evidence_output]
 #     )
 # if __name__ == "__main__":
 #     demo.launch()
 import gradio as gr
 import fitz  # PyMuPDF
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain_huggingface import HuggingFaceEmbeddings
 class VectorSystem:
     def __init__(self):
         self.vector_store = None
         self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+        # NEW: We keep a copy of all chunks in a list so we can access neighbors by index
+        self.all_chunks = []
     def process_file(self, file_obj):
+        """Extracts text, preserves order, and builds the Vector Index"""
         if file_obj is None:
             return "No file uploaded."
         try:
+            # 1. Extract Text
             text = ""
             file_path = file_obj.name
             if file_path.lower().endswith('.pdf'):
                 doc = fitz.open(file_path)
+                for page in doc: text += page.get_text()
             elif file_path.lower().endswith('.txt'):
+                with open(file_path, 'r', encoding='utf-8') as f: text = f.read()
             else:
                 return "❌ Error: Only .pdf and .txt files are supported."
+            # 2. Split Text
             text_splitter = RecursiveCharacterTextSplitter(
                 chunk_size=800,
                 chunk_overlap=150,
                 separators=["\n\n", "\n", ".", " ", ""]
             )
+            # Store chunks in the class so we can look them up by ID later
+            self.all_chunks = text_splitter.split_text(text)
+            if not self.all_chunks:
                 return "Could not extract text. Is the file empty?"
+            # 3. Build Vector Index with ID Metadata
+            # We attach the index ID (0, 1, 2...) to every vector
+            metadatas = [{"id": i} for i in range(len(self.all_chunks))]
+            self.vector_store = FAISS.from_texts(
+                self.all_chunks,
+                self.embeddings,
+                metadatas=metadatas
+            )
+            return f"✅ Success! Indexed {len(self.all_chunks)} chunks."
         except Exception as e:
             return f"Error processing file: {str(e)}"
         if not question:
             return "⚠️ Please enter a Question."
+        # NEW: use 'similarity_search_with_score' so each match comes back with its distance score
+        # Lower Score = Better Match (L2 Distance)
+        results = self.vector_store.similarity_search_with_score(question, k=3)
+        output_text = "### 🔍 Expanded Context Analysis:\n"
+        for i, (doc, score) in enumerate(results):
+            # Get the ID of the matched chunk
+            chunk_id = doc.metadata['id']
+            # Retrieve Previous and Next chunks from our saved list
+            # Bounds checks substitute a placeholder string when the match is the first or last chunk
+            prev_chunk = self.all_chunks[chunk_id - 1] if chunk_id > 0 else "[Start of Text]"
+            next_chunk = self.all_chunks[chunk_id + 1] if chunk_id < len(self.all_chunks) - 1 else "[End of Text]"
+            output_text += f"\n#### 🎯 Match #{i+1} (Distance Score: {score:.4f})\n"
+            output_text += f"*A lower score means a closer match.*\n\n"
+            # Display Preceding Context (last 200 characters of the previous chunk, as a blockquote)
+            output_text += f"> **Preceding Context:** ...{prev_chunk[-200:]}\n"
+            # Display The Actual Match (full chunk text after a bold "MATCH:" label)
+            output_text += f"> **MATCH:** {doc.page_content}\n"
+            # Display Succeeding Context
+            output_text += f"> **Succeeding Context:** {next_chunk[:200]}...\n"
+            output_text += "---\n"
         return output_text
 # Initialize System
 system = VectorSystem()
 # --- Gradio UI ---
 with gr.Blocks(title="EduGenius Context Retriever") as demo:
+    gr.Markdown("# 🎓 EduGenius: Smart Context Retriever")
+    gr.Markdown("Upload a Chapter. This version finds the best match AND shows you the text immediately before and after it.")
     with gr.Row():
         with gr.Column(scale=1):
             pdf_input = gr.File(label="1. Upload File (PDF or TXT)", file_types=[".pdf", ".txt"])
             upload_btn = gr.Button("Process File", variant="primary")
             upload_status = gr.Textbox(label="Status", interactive=False)
         with gr.Column(scale=2):
             question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
+            answer_input = gr.Textbox(label="Student Answer (Optional)", placeholder="e.g., The heat causes it...")
+            search_btn = gr.Button("Find Context + Neighbors", variant="secondary")
             evidence_output = gr.Markdown(label="Relevant Text Chunks")
+    upload_btn.click(fn=system.process_file, inputs=[pdf_input], outputs=[upload_status])
+    search_btn.click(fn=system.retrieve_evidence, inputs=[question_input, answer_input], outputs=[evidence_output])
 if __name__ == "__main__":
     demo.launch()