File size: 10,828 Bytes
1049876
3e756d2
6a20046
 
8207b6c
6a20046
 
 
 
 
 
 
f0f0224
1049876
6a20046
d443dc8
1049876
6a20046
 
 
 
1049876
6a20046
d443dc8
6a20046
d443dc8
 
1049876
d443dc8
1049876
d443dc8
 
 
1049876
6a20046
 
 
 
 
f0f0224
1049876
6a20046
1049876
d443dc8
6a20046
1049876
f0f0224
1049876
6a20046
1049876
 
 
 
 
 
 
6a20046
 
d443dc8
6a20046
 
 
d443dc8
f0f0224
6a20046
 
 
f0f0224
1049876
 
f0f0224
3e756d2
f0f0224
1049876
6a20046
f0f0224
 
 
 
1049876
f0f0224
1049876
f0f0224
3e756d2
f0f0224
 
 
1049876
 
 
6a20046
 
 
 
 
 
 
f0f0224
 
6a20046
 
 
d443dc8
 
8207b6c
6a20046
 
 
1049876
f0f0224
6a20046
 
 
1049876
 
6a20046
6ca2c19
f0f0224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257


import gradio as gr
import fitz  # PyMuPDF
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

class VectorSystem:
    """Indexes an uploaded document into FAISS and retrieves the chunks most
    similar to a question, together with their immediate neighbor chunks so
    the reader sees surrounding context."""

    def __init__(self):
        self.vector_store = None  # FAISS index; populated by process_file()
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        # Ordered copy of every chunk so neighbors can be looked up by index.
        self.all_chunks = []

    def process_file(self, file_obj):
        """Extract text from a PDF/TXT upload, split it into overlapping
        chunks (document order preserved), and build the FAISS vector index.

        Args:
            file_obj: Gradio file object exposing a ``.name`` path, or None.

        Returns:
            A human-readable status string for the UI (success or error).
        """
        if file_obj is None:
            return "No file uploaded."

        try:
            # 1. Extract text according to the file extension.
            file_path = file_obj.name
            text = ""

            if file_path.lower().endswith('.pdf'):
                # FIX: use a context manager so the PyMuPDF document handle
                # is closed (the original leaked it on every upload).
                with fitz.open(file_path) as doc:
                    for page in doc:
                        text += page.get_text()
            elif file_path.lower().endswith('.txt'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            else:
                return "❌ Error: Only .pdf and .txt files are supported."

            # 2. Split into overlapping chunks; the overlap keeps sentences
            #    intact across chunk boundaries.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=800,
                chunk_overlap=150,
                separators=["\n\n", "\n", ".", " ", ""]
            )
            # Keep the ordered chunk list so retrieve_evidence can fetch
            # neighbors by positional index later.
            self.all_chunks = text_splitter.split_text(text)

            if not self.all_chunks:
                return "Could not extract text. Is the file empty?"

            # 3. Build the vector index, attaching each chunk's positional id
            #    (0, 1, 2, ...) as metadata so a match can be traced back to
            #    its neighbors in self.all_chunks.
            metadatas = [{"id": i} for i in range(len(self.all_chunks))]

            self.vector_store = FAISS.from_texts(
                self.all_chunks,
                self.embeddings,
                metadatas=metadatas
            )

            return f"βœ… Success! Indexed {len(self.all_chunks)} chunks."

        except Exception as e:
            return f"Error processing file: {str(e)}"

    def retrieve_evidence(self, question, student_answer):
        """Find the top-3 chunks most similar to *question* and render them,
        flanked by their preceding/succeeding neighbor chunks, as Markdown.

        Args:
            question: Query text to search the index with.
            student_answer: Currently unused; wired from the UI and reserved
                for future answer-aware scoring.

        Returns:
            A Markdown string, or a warning string if preconditions fail.
        """
        if not self.vector_store:
            return "⚠️ Please upload and process a file first."

        # FIX: also reject whitespace-only questions, which previously
        # slipped past the guard and hit the vector store.
        if not question or not question.strip():
            return "⚠️ Please enter a Question."

        # FAISS returns an L2 distance: lower score = better match.
        results = self.vector_store.similarity_search_with_score(question, k=3)

        output_text = "### πŸ” Expanded Context Analysis:\n"

        for i, (doc, score) in enumerate(results):
            chunk_id = doc.metadata['id']

            # Look up the neighboring chunks by positional id; fall back to
            # placeholder text at either end of the document.
            prev_chunk = self.all_chunks[chunk_id - 1] if chunk_id > 0 else "(Start of Text)"
            next_chunk = self.all_chunks[chunk_id + 1] if chunk_id < len(self.all_chunks) - 1 else "(End of Text)"

            output_text += f"\n#### 🎯 Match #{i+1} (Distance Score: {score:.4f})\n"
            output_text += f"> **Preceding Context:**\n{prev_chunk}\n\n"
            output_text += f"> **MATCH:**\n**{doc.page_content}**\n\n"
            output_text += f"> **Succeeding Context:**\n{next_chunk}\n"
            output_text += "---\n"

        return output_text

# Initialize System: one module-level instance shared by both button callbacks,
# so the index built by process_file() is visible to retrieve_evidence().
system = VectorSystem()

# --- Gradio UI ---
# NOTE: component creation order inside the Blocks/Row/Column context managers
# determines the on-screen layout — do not reorder these statements casually.
with gr.Blocks(title="EduGenius Context Retriever") as demo:
    gr.Markdown("# πŸŽ“ EduGenius: Smart Context Retriever")
    gr.Markdown("Upload a Chapter. This version finds the best match AND shows you the text immediately before and after it.")

    with gr.Row():
        # Left column: file upload + indexing controls.
        with gr.Column(scale=1):
            pdf_input = gr.File(label="1. Upload File (PDF or TXT)", file_types=[".pdf", ".txt"])
            upload_btn = gr.Button("Process File", variant="primary")
            upload_status = gr.Textbox(label="Status", interactive=False)

        # Right column: question entry and retrieval results.
        with gr.Column(scale=2):
            question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
            # NOTE(review): answer_input is wired into retrieve_evidence below,
            # but that method never reads its student_answer parameter —
            # presumably reserved for future answer-aware scoring; confirm.
            answer_input = gr.Textbox(label="Student Answer (Optional)", placeholder="e.g., The heat causes it...")
            search_btn = gr.Button("Find Context + Neighbors", variant="secondary")
            
            evidence_output = gr.Markdown(label="Relevant Text Chunks")

    # Wire callbacks: indexing writes a status string; search renders Markdown.
    upload_btn.click(fn=system.process_file, inputs=[pdf_input], outputs=[upload_status])
    search_btn.click(fn=system.retrieve_evidence, inputs=[question_input, answer_input], outputs=[evidence_output])

if __name__ == "__main__":
    demo.launch()









# import gradio as gr
# import fitz  # PyMuPDF
# import numpy as np
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain_community.vectorstores import FAISS
# from langchain_huggingface import HuggingFaceEmbeddings

# class VectorSystem:
#     def __init__(self):
#         self.vector_store = None
#         self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
#         self.all_chunks = []

#     def process_file(self, file_obj):
#         """Extracts text, preserves order, and builds the Vector Index"""
#         if file_obj is None:
#             return "No file uploaded."

#         try:
#             # 1. Extract Text
#             text = ""
#             file_path = file_obj.name
            
#             if file_path.lower().endswith('.pdf'):
#                 doc = fitz.open(file_path)
#                 for page in doc: text += page.get_text()
#             elif file_path.lower().endswith('.txt'):
#                 with open(file_path, 'r', encoding='utf-8') as f: text = f.read()
#             else:
#                 return "❌ Error: Only .pdf and .txt files are supported."

#             # 2. Split Text
#             text_splitter = RecursiveCharacterTextSplitter(
#                 chunk_size=800,
#                 chunk_overlap=150,
#                 separators=["\n\n", "\n", ".", " ", ""]
#             )
#             self.all_chunks = text_splitter.split_text(text)

#             if not self.all_chunks:
#                 return "Could not extract text. Is the file empty?"

#             # 3. Build Vector Index with ID Metadata
#             metadatas = [{"id": i} for i in range(len(self.all_chunks))]
            
#             self.vector_store = FAISS.from_texts(
#                 self.all_chunks, 
#                 self.embeddings, 
#                 metadatas=metadatas
#             )
            
#             return f"βœ… Success! Indexed {len(self.all_chunks)} chunks."
        
#         except Exception as e:
#             return f"Error processing file: {str(e)}"

#     def retrieve_evidence(self, question, student_answer):
#         if not self.vector_store:
#             return "⚠️ Please upload and process a file first."
#         if not question:
#             return "⚠️ Please enter a Question."

#         # 1. Get Initial Results (Core Matches)
#         # FAISS returns L2 distance (Lower is better)
#         results = self.vector_store.similarity_search_with_score(question, k=3)
        
#         # We need the vector for the QUESTION to do our own math later
#         q_vector = np.array(self.embeddings.embed_query(question))
        
#         output_text = "### πŸ” Smart Context Analysis:\n"
        
#         for i, (doc, core_score) in enumerate(results):
#             chunk_id = doc.metadata['id']
            
#             # 2. Identify Neighbors
#             prev_chunk = self.all_chunks[chunk_id - 1] if chunk_id > 0 else ""
#             next_chunk = self.all_chunks[chunk_id + 1] if chunk_id < len(self.all_chunks) - 1 else ""
            
#             # 3. Create the "Super Chunk" (Prev + Core + Next)
#             super_chunk_text = f"{prev_chunk} {doc.page_content} {next_chunk}"
            
#             # 4. Calculate "Super Score" (Re-embedding on the fly)
#             # We embed the Super Chunk and measure distance to Question
#             super_vector = np.array(self.embeddings.embed_query(super_chunk_text))
#             super_score = np.linalg.norm(q_vector - super_vector) # Euclidean Distance
            
#             output_text += f"\n#### 🎯 Match #{i+1}\n"
            
#             # 5. The Logic Test: Does Context Improve the Score?
#             # Remember: LOWER score is BETTER (closer distance)
            
#             if super_score < core_score:
#                 # CASE A: Context Helps! (Distance Reduced)
#                 output_text += f"**βœ… Context Added:** The surrounding text made the match stronger (Score improved from {core_score:.3f} to {super_score:.3f}).\n\n"
#                 output_text += f"> {prev_chunk} **{doc.page_content}** {next_chunk}\n"
#             else:
#                 # CASE B: Context Dilutes! (Distance Increased or Same)
#                 output_text += f"**⏹️ Context Ignored:** Surrounding text was irrelevant or noisy (Score worsened from {core_score:.3f} to {super_score:.3f}). Showing Core Match only.\n\n"
#                 output_text += f"> **{doc.page_content}**\n"
            
#             output_text += "---\n"

#         return output_text

# # Initialize System
# system = VectorSystem()

# # --- Gradio UI ---
# with gr.Blocks(title="EduGenius Context Retriever") as demo:
#     gr.Markdown("# πŸŽ“ EduGenius: Intelligent Context Retriever")
#     gr.Markdown("Upload a Chapter. This system intelligently decides if it needs to read the surrounding paragraphs to answer your question.")

#     with gr.Row():
#         with gr.Column(scale=1):
#             pdf_input = gr.File(label="1. Upload File (PDF or TXT)", file_types=[".pdf", ".txt"])
#             upload_btn = gr.Button("Process File", variant="primary")
#             upload_status = gr.Textbox(label="Status", interactive=False)

#         with gr.Column(scale=2):
#             question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
#             answer_input = gr.Textbox(label="Student Answer (Optional)", placeholder="e.g., The heat causes it...")
#             search_btn = gr.Button("Find Evidence", variant="secondary")
            
#             evidence_output = gr.Markdown(label="Relevant Text Chunks")

#     upload_btn.click(fn=system.process_file, inputs=[pdf_input], outputs=[upload_status])
#     search_btn.click(fn=system.retrieve_evidence, inputs=[question_input, answer_input], outputs=[evidence_output])

# if __name__ == "__main__":
#     demo.launch()