# Math Tutor Chatbot: a Gradio app with Supabase authentication and chat
# history, answering questions over uploaded PDFs/images via semantic search
# plus GPT-4o-mini.
import pymupdf
import pytesseract
from PIL import Image
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import gradio as gr
from supabase import create_client, Client
import uuid
import hashlib
from openai import OpenAI

# =============================================================================
# CONNECTIONS: Read API keys from HF Secrets (environment variables)
# Set these in your Space: Settings > Variables and secrets
# =============================================================================
# Both clients are created once at import time and shared by every function
# below. os.getenv returns None when a variable is unset and nothing here
# validates that, so a missing secret is passed straight to the constructor.
supabase: Client = create_client(
    os.getenv("SUPABASE_URL"),
    os.getenv("SUPABASE_ANON_KEY")
)
# OpenAI client used by generate_answer for chat completions.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# =============================================================================
# MODEL: Load the sentence transformer for semantic search
# This runs once on startup. It finds which text chunks are most relevant
# to the user's question before sending them to GPT.
# =============================================================================
# Loaded once at import time and shared: model.encode() is called on this same
# instance for both the document chunks and the user's query elsewhere in this
# file.
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Model loaded!")

# =============================================================================
# FILE PROCESSING: Extract raw text from uploaded PDFs and images
# =============================================================================

def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A single string made of each page's ``get_text()`` output, back to
        back with no separator (empty when no page yields any text).

    Raises:
        Whatever ``pymupdf.open`` or ``page.get_text`` raises; errors are not
        caught here.
    """
    # The context manager closes the document (and its file handle) even when
    # a page fails to extract; previously the document was never closed.
    with pymupdf.open(file_path) as doc:
        # join() avoids rebuilding the string once per page.
        return "".join(page.get_text() for page in doc)

def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image file to open with Pillow.

    Returns:
        The OCR output with surrounding whitespace stripped, or an empty
        string when the image cannot be opened or OCR fails.
    """
    try:
        # Context manager releases the image's file handle once OCR is done.
        with Image.open(image_path) as img:
            extracted_text = pytesseract.image_to_string(img)
    except Exception as e:
        # Report the failure on stdout, but do NOT return the message as text:
        # the caller would chunk it and send it to the model as if it were the
        # document. An empty string lets the caller's "no text" check fire.
        print(f"Error extracting text from image: {e}")
        return ""
    return extracted_text.strip()

# =============================================================================
# TEXT CHUNKING: Break long documents into overlapping pieces
# Overlap ensures we don't cut off a sentence right at a chunk boundary
# =============================================================================

def chunk_text(text, chunk_size=1000, overlap=200):
    """Split text into fixed-size chunks where consecutive chunks overlap.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between one chunk and the next;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of substrings. Chunk ``i`` starts at offset
        ``i * (chunk_size - overlap)``; the last chunk may be shorter than
        ``chunk_size``. Empty text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check, ``overlap >= chunk_size`` would never advance the
            start offset and the loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# =============================================================================
# SEMANTIC SEARCH: Find the 3 most relevant chunks for the question
# Uses cosine similarity between the question embedding and chunk embeddings
# =============================================================================

def search_relevant_chunks(query, chunks, embeddings, top_k=3):
    """Return the chunks most semantically similar to the query.

    Args:
        query: The user's question as a string.
        chunks: Sequence of text chunks.
        embeddings: Embeddings of ``chunks`` in the same order, as accepted by
            ``cosine_similarity`` (one row per chunk).
        top_k: Maximum number of chunks to return (default 3).

    Returns:
        Up to ``top_k`` chunks ordered from most to least similar. An empty
        list when there are no chunks or ``top_k`` is not positive.
    """
    # Nothing to rank: avoid calling cosine_similarity on an empty matrix.
    if len(chunks) == 0 or top_k <= 0:
        return []

    query_vec = model.encode([query])
    similarities = cosine_similarity(query_vec, embeddings)[0]
    # argsort is ascending, so reverse it and keep the first top_k indices.
    top_indices = np.argsort(similarities)[::-1][:top_k]
    return [chunks[i] for i in top_indices]

# =============================================================================
# FILE HASHING: Create a unique fingerprint for each uploaded file
# Used to track which file was used in a chat session
# =============================================================================

def get_file_hash(file_path):
    """Return the MD5 hex digest of a file's contents.

    The digest is used only as a fingerprint to identify which file a chat
    message referred to, not for security.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The 32-character hexadecimal MD5 digest, or ``None`` when the file
        cannot be opened or read (missing file, bad path, permission error).
    """
    try:
        digest = hashlib.md5()
        with open(file_path, "rb") as f:
            # Hash in fixed-size blocks so large uploads are not loaded into
            # memory in one piece.
            for block in iter(lambda: f.read(65536), b""):
                digest.update(block)
    except (OSError, TypeError, ValueError):
        # OSError: unreadable/missing file; TypeError/ValueError: invalid
        # path argument (e.g. None) or md5 unavailable on this build.
        return None
    return digest.hexdigest()

# =============================================================================
# AI ANSWER: Send question + context to GPT-4o-mini
# Uses Socratic method: guides the student rather than just giving answers
# If no file is uploaded, answers from general knowledge
# =============================================================================

def generate_answer(question, context):
    """Generate a Socratic/Feynman-style answer using GPT-4o-mini.

    Args:
        question: The user's question.
        context: Either retrieved document text to ground the answer in, or a
            marker string beginning with "No document provided" (or an
            empty/None value) to request an answer from general knowledge.

    Returns:
        The model's reply with surrounding whitespace stripped; an empty
        string when the API returns no text content.

    Raises:
        Whatever the OpenAI client raises for request failures; errors are
        not caught here.
    """
    # Only treat the context as the "no document" marker when it STARTS with
    # the phrase. A substring test would discard a real document that merely
    # happens to contain those words somewhere in its text.
    no_document = not context or context.lstrip().startswith("No document provided")
    if no_document:
        system_prompt = "You are a helpful academic math tutor. Use the Socratic method to guide the student."
    else:
        system_prompt = f"You are an academic assistant. Based only on the following context, answer the question:\n{context}"

    prompt = f"""
{system_prompt}

Give me the output without latex format.
Use the socratic/feynman method for learning.

Question:
{question}

Answer:
"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2
    )
    # message.content is optional in the API response; guard before strip().
    content = response.choices[0].message.content
    return (content or "").strip()

# =============================================================================
# CHAT WITH FILE: Main RAG pipeline
# Combines file reading, chunking, search, and answer generation
# Falls back to general knowledge if no file is uploaded
# =============================================================================

def chat_with_file(question, file):
    """Answer a question, grounding it in the uploaded file when there is one.

    With no file, the question is answered from general knowledge. Otherwise
    the file's text is extracted (PDF or image), chunked, embedded, and the
    most relevant chunks are passed to the answer generator as context.

    Args:
        question: The user's question.
        file: Uploaded file object exposing a ``name`` path attribute, or None.

    Returns:
        The generated answer, or a short message when the file type is
        unsupported or no text could be extracted.
    """
    if file is None:
        return generate_answer(question, context="No document provided. Answer from general knowledge.")

    path = file.name
    suffix = os.path.splitext(path)[1].lower()
    image_suffixes = (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff")

    # Reject anything that is neither a PDF nor a known image type up front.
    is_pdf = suffix == ".pdf"
    if not is_pdf and suffix not in image_suffixes:
        return "Unsupported file type. Please upload a PDF or image file."

    text = extract_text_from_pdf(path) if is_pdf else extract_text_from_image(path)
    if not text.strip():
        return "No text could be extracted from the file."

    # Retrieval step: embed every chunk, keep the best matches for the question.
    pieces = chunk_text(text)
    piece_vectors = model.encode(pieces)
    best_pieces = search_relevant_chunks(question, pieces, piece_vectors)
    return generate_answer(question, "\n\n".join(best_pieces))

# =============================================================================
# DATABASE: Save and load chat history from Supabase
# Each message is stored with user_id, session_id, question, and answer
# Sessions allow users to revisit past conversations
# =============================================================================

def save_chat_to_db(user_id, session_id, question, answer, file_name=None, file_hash=None):
    """Insert one question/answer exchange into the ``chat_history`` table.

    Args:
        user_id: ID of the user who asked the question.
        session_id: ID of the chat session the exchange belongs to.
        question: The user's question text.
        answer: The generated answer text.
        file_name: Optional name of the file attached to the question.
        file_hash: Optional fingerprint of that file.

    Returns:
        True when the insert succeeded, False when it raised (the error is
        printed, not re-raised).
    """
    record = {
        "user_id": user_id,
        "session_id": session_id,
        "question": question,
        "answer": answer,
        "file_name": file_name,
        "file_hash": file_hash
    }
    try:
        supabase.table("chat_history").insert(record).execute()
    except Exception as err:
        print(f"Error saving chat: {err}")
        return False
    return True

def load_chat_history(user_id, session_id=None, limit=50):
    """Fetch a user's stored exchanges as ``[question, answer]`` pairs.

    Args:
        user_id: Only rows belonging to this user are returned.
        session_id: When truthy, additionally restrict to this session.
        limit: Maximum number of rows requested.

    Returns:
        A list of ``[question, answer]`` lists ordered by ``created_at``
        ascending, or an empty list when the query fails (the error is
        printed, not re-raised).
    """
    try:
        query = (
            supabase.table("chat_history")
            .select("*")
            .eq("user_id", user_id)
            .order("created_at", desc=False)
            .limit(limit)
        )
        if session_id:
            query = query.eq("session_id", session_id)
        rows = query.execute().data
        return [[row["question"], row["answer"]] for row in rows]
    except Exception as err:
        print(f"Error loading history: {err}")
        return []

def get_user_sessions(user_id, limit=10):
    """Return a deduplicated list of a user's most recent sessions.

    Rows are read newest first, and the first row seen for each session ID
    (i.e. that session's most recent message) supplies its timestamp and
    file name.

    Args:
        user_id: Only rows belonging to this user are considered.
        limit: Maximum number of sessions to return. ``limit * 5`` rows are
            fetched so that sessions with several messages still leave room
            for up to ``limit`` distinct sessions.

    Returns:
        A list of dicts with keys ``session_id``, ``created_at`` and
        ``file_name`` (``"No file"`` when the row has none), newest first.
        An empty list when the query fails (the error is printed).
    """
    try:
        response = (
            supabase.table("chat_history")
            .select("session_id, created_at, file_name")
            .eq("user_id", user_id)
            .order("created_at", desc=True)
            .limit(limit * 5)
            .execute()
        )
        sessions = {}
        for msg in response.data:
            sid = msg["session_id"]
            if sid not in sessions:
                sessions[sid] = {
                    "session_id": sid,
                    "created_at": msg["created_at"],
                    # The selected column is present even when NULL, so a
                    # .get() default alone never applies; fall back on None.
                    "file_name": msg.get("file_name") or "No file"
                }
        return list(sessions.values())[:limit]
    except Exception as e:
        print(f"Error loading sessions: {e}")
        return []

# =============================================================================
# AUTH MANAGER: Handles signup, login, and logout via Supabase Auth
# Stores the current user and session ID in memory while the app is running
# =============================================================================

class AuthManager:
    """Wraps Supabase Auth signup/login/logout and tracks the logged-in user.

    State is held in memory on the instance: ``current_user`` is the user
    object returned by Supabase on a successful login (None when logged out)
    and ``session_id`` is a random UUID string generated at login.
    """

    def __init__(self):
        self.current_user = None
        self.session_id = None

    def signup(self, email, password, username):
        """Create a new Supabase Auth user with the username in its metadata.

        Returns:
            ``(success, message)``. Errors are converted into a failure
            message rather than raised.
        """
        try:
            response = supabase.auth.sign_up({
                "email": email,
                "password": password,
                "options": {"data": {"username": username}}
            })
        except Exception as e:
            error_msg = str(e)
            lowered = error_msg.lower()
            if "duplicate" in lowered or "unique" in lowered:
                return False, "Username or email already exists"
            return False, f"Error: {error_msg}"

        if response.user:
            return True, "Account created! Please check your email to verify."
        return False, "Signup failed"

    def login(self, email, password):
        """Sign in with email and password.

        Returns:
            ``(success, message, user_id)``; ``user_id`` is None on failure.
            Errors from the sign-in call are converted into a failure message.
        """
        try:
            response = supabase.auth.sign_in_with_password({
                "email": email,
                "password": password
            })
        except Exception as e:
            return False, f"Login error: {str(e)}", None

        user = response.user
        if not user:
            return False, "Invalid credentials", None

        self.current_user = user
        self.session_id = str(uuid.uuid4())

        # The profile lookup only personalizes the greeting. It is isolated so
        # that a failure here cannot report "login failed" while current_user
        # has already been set (which would leave the two out of sync).
        username = "User"
        try:
            profile = (
                supabase.table("user_profiles")
                .select("username")
                .eq("id", user.id)
                .execute()
            )
            if profile.data:
                username = profile.data[0].get("username") or "User"
        except Exception as e:
            print(f"Error loading profile: {e}")

        return True, f"Welcome back, {username}!", user.id

    def logout(self):
        """Sign out of Supabase and clear the local user state.

        Returns:
            ``(success, message)``. The local state is cleared even when the
            remote sign-out raises, so the instance never stays authenticated
            after a logout attempt.
        """
        try:
            supabase.auth.sign_out()
            return True, "Logged out successfully"
        except Exception as e:
            return False, f"Logout error: {str(e)}"
        finally:
            self.current_user = None
            self.session_id = None

    def is_authenticated(self):
        """Return True if a user is currently logged in on this instance."""
        return self.current_user is not None

# Create a single global auth manager instance.
# NOTE(review): because this object lives at module level, its current_user /
# session_id are shared by every caller in this process (e.g. the
# auth.is_authenticated() check in chat_with_file_and_save) — confirm that is
# acceptable when several users are connected at the same time.
auth = AuthManager()

# =============================================================================
# CHAT HANDLER: Combines chat_with_file with database saving
# Requires the user to be logged in before processing
# =============================================================================

def chat_with_file_and_save(question, file, history, user_id, session_id):
    """Answer a question, persist the exchange, and return the updated chat.

    Args:
        question: The user's question text.
        file: Uploaded file object exposing a ``name`` path, or None.
        history: Current chat display as a list of ``[question, answer]``
            pairs; None is treated as an empty chat.
        user_id: ID stored with the saved exchange.
        session_id: Session ID stored with the saved exchange.

    Returns:
        ``(history, "", None)``: the new chat history, an empty string to
        clear the question box, and None to clear the file input.
    """
    history = history or []

    if not auth.is_authenticated():
        return history + [["", "Please login to use the chatbot."]], "", None

    # Ignore blank submissions instead of sending an empty question to the
    # model and storing a meaningless row.
    if not question or not question.strip():
        return history, "", None

    answer = chat_with_file(question, file)
    file_name = os.path.basename(file.name) if file else None
    file_hash = get_file_hash(file.name) if file else None

    # Best effort: save_chat_to_db reports failure via its return value and
    # the answer is shown to the user either way.
    save_chat_to_db(
        user_id=user_id,
        session_id=session_id,
        question=question,
        answer=answer,
        file_name=file_name,
        file_hash=file_hash
    )

    return history + [[question, answer]], "", None

# =============================================================================
# GRADIO INTERFACE: Full UI with two tabs
#   Tab 1: Login / Signup
#   Tab 2: Chat with file upload, session history, and session loader
# =============================================================================

def create_interface():
    """Build the Gradio Blocks UI and wire its events.

    Layout:
      * Tab "Login / Sign Up": login form (left) and signup form (right).
      * Tab "Chat": chat window with question box, file attachment and
        session controls (left); past-session table and loader (right).

    Per-browser-session values (user ID and chat session ID) are kept in
    ``gr.State`` components.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(title="Math Tutor Chatbot", theme=gr.themes.Soft()) as demo:

        # Hidden state: stores user ID and session ID across interactions
        user_id_state = gr.State(None)
        session_id_state = gr.State(None)

        gr.Markdown("# Math Tutor Chatbot")
        gr.Markdown("Create an account to save your chat history and get Socratic math tutoring!")

        with gr.Tabs() as tabs:

            # ── TAB 1: Login and Signup ────────────────────────────────────
            with gr.Tab("Login / Sign Up", id="login_tab"):
                with gr.Row():

                    # Left side: Login
                    with gr.Column():
                        gr.Markdown("### Login to Existing Account")
                        login_email = gr.Textbox(label="Email", placeholder="you@example.com")
                        login_password = gr.Textbox(label="Password", type="password")
                        login_btn = gr.Button("Login", variant="primary", size="lg")
                        login_msg = gr.Markdown("")

                    # Right side: Signup
                    with gr.Column():
                        gr.Markdown("### Create New Account")
                        signup_email = gr.Textbox(label="Email", placeholder="you@example.com")
                        signup_username = gr.Textbox(label="Username", placeholder="cool_username")
                        signup_password = gr.Textbox(label="Password", type="password")
                        signup_btn = gr.Button("Sign Up", variant="primary", size="lg")
                        signup_msg = gr.Markdown("")

            # ── TAB 2: Chat ────────────────────────────────────────────────
            with gr.Tab("Chat", id="chat_tab"):
                gr.Markdown("### Upload a PDF or image and ask questions!")

                with gr.Row():

                    # Left: Chat area
                    with gr.Column(scale=3):
                        chatbot = gr.Chatbot(label="Conversation", height=500, type="tuples")

                        with gr.Row():
                            question_input = gr.Textbox(
                                show_label=False,
                                placeholder="Ask a math question or about your uploaded file...",
                                scale=6
                            )
                            file_input = gr.File(
                                label="Attach",
                                file_types=[".pdf", ".png", ".jpg", ".jpeg"],
                                scale=1
                            )
                            send_btn = gr.Button("Send", scale=1, variant="primary")

                        with gr.Row():
                            new_session_btn = gr.Button("New Session", size="sm")
                            clear_btn = gr.Button("Clear Chat", size="sm")
                            logout_btn = gr.Button("Logout", size="sm")

                        # Status line for session loading. It lives on this tab
                        # so the feedback is visible where the action happens.
                        chat_status = gr.Markdown("")

                    # Right: Session history panel
                    with gr.Column(scale=1):
                        gr.Markdown("### Your Past Sessions")
                        sessions_display = gr.Dataframe(
                            headers=["Date", "File"],
                            datatype=["str", "str"],
                            interactive=False,
                            row_count=5
                        )
                        refresh_sessions_btn = gr.Button("Refresh Sessions", size="sm")
                        gr.Markdown("**Load a Previous Session:**")
                        session_dropdown = gr.Dropdown(
                            label="Select Session",
                            choices=[],
                            interactive=True,
                            value=None
                        )
                        load_session_btn = gr.Button("Load Selected Session", size="sm", variant="primary")

        # ── EVENT HANDLERS ─────────────────────────────────────────────────

        def handle_login(email, password):
            """Logs in and switches to the chat tab on success."""
            success, message, uid = auth.login(email, password)
            if success:
                return message, uid, str(uuid.uuid4()), gr.update(selected="chat_tab")
            else:
                return message, None, None, gr.update()

        def handle_signup(email, password, username):
            """Creates a new account and returns a status message."""
            success, message = auth.signup(email, password, username)
            return message

        def handle_send(question, file, history, user_id, session_id):
            """Sends the question through the RAG pipeline and saves result."""
            if not user_id:
                return (history or []) + [["", "Please login first!"]], "", None
            return chat_with_file_and_save(question, file, history, user_id, session_id)

        def handle_logout():
            """Logs out and switches back to the login tab."""
            auth.logout()
            return [], "Logged out successfully", None, None, gr.update(selected="login_tab")

        def handle_new_session(user_id):
            """Clears the chat and generates a fresh session ID."""
            return [], str(uuid.uuid4())

        def handle_refresh_sessions(user_id):
            """Loads recent sessions from DB and populates the dropdown."""
            # Always answer the dropdown with an update object: a bare [] would
            # be interpreted as the dropdown's *value*, not its choices.
            empty_dropdown = gr.update(choices=[], value=None)
            if not user_id:
                return [["Login first", ""]], empty_dropdown
            sessions = get_user_sessions(user_id, limit=20)
            if not sessions:
                return [["No sessions yet", ""]], empty_dropdown
            df_data = [
                [s["created_at"][:19], s["file_name"] or "No file"]
                for s in sessions
            ]
            # Each choice is (label, value): the label is what the user sees,
            # the session ID is what the load handler receives. This avoids
            # re-deriving the session from a timestamp prefix, which is
            # ambiguous when two sessions share the same second.
            # Using .format() instead of f-strings to avoid quote conflicts
            dropdown_choices = [
                (
                    "{} - {}".format(s["created_at"][:19], (s["file_name"] or "No file")[:20]),
                    s["session_id"],
                )
                for s in sessions
                if s["session_id"]
            ]
            return df_data, gr.update(choices=dropdown_choices, value=None)

        def handle_load_session(user_id, selected_session_id, history, session_id):
            """Loads a previously selected session into the chat window.

            On any failure the current chat and session ID are returned
            unchanged, so a failed load never wipes the conversation or leaves
            later messages without a session ID.
            """
            if not user_id or not selected_session_id:
                return history, session_id, "Select a session first"
            loaded = load_chat_history(user_id, selected_session_id)
            if loaded:
                return loaded, selected_session_id, "Session loaded!"
            return history, session_id, "Session not found"

        # ── WIRE UP BUTTONS TO HANDLERS ────────────────────────────────────

        login_btn.click(
            fn=handle_login,
            inputs=[login_email, login_password],
            outputs=[login_msg, user_id_state, session_id_state, tabs]
        )
        signup_btn.click(
            fn=handle_signup,
            inputs=[signup_email, signup_password, signup_username],
            outputs=[signup_msg]
        )
        send_btn.click(
            fn=handle_send,
            inputs=[question_input, file_input, chatbot, user_id_state, session_id_state],
            outputs=[chatbot, question_input, file_input]
        )
        question_input.submit(
            fn=handle_send,
            inputs=[question_input, file_input, chatbot, user_id_state, session_id_state],
            outputs=[chatbot, question_input, file_input]
        )
        logout_btn.click(
            fn=handle_logout,
            outputs=[chatbot, login_msg, user_id_state, session_id_state, tabs]
        )
        new_session_btn.click(
            fn=handle_new_session,
            inputs=[user_id_state],
            outputs=[chatbot, session_id_state]
        )
        clear_btn.click(fn=lambda: [], outputs=[chatbot])
        refresh_sessions_btn.click(
            fn=handle_refresh_sessions,
            inputs=[user_id_state],
            outputs=[sessions_display, session_dropdown]
        )
        load_session_btn.click(
            fn=handle_load_session,
            inputs=[user_id_state, session_dropdown, chatbot, session_id_state],
            outputs=[chatbot, session_id_state, chat_status]
        )

    return demo


# Script entry point: build the UI and start the Gradio server only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()