File size: 4,210 Bytes
c0979a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import gradio as gr
import fitz  # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import groq
import traceback

# πŸ” Set your GROQ API Key as a HF Space secret (recommended)
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "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")  # or set here temporarily
groq_client = groq.Groq(api_key=GROQ_API_KEY)

# ==========================
# 🔧 Prompt Templates
# ==========================
# System role sent with every request: fixes the assistant's persona and tone.
SYSTEM_TEMPLATE = "You are an expert academic supervisor helping students understand academic papers. Be concise, clear, and encouraging."
# User message template; filled with the retrieved chunks ({context}) and the
# student's question ({question}) in answer_question().
USER_TEMPLATE = "Based on the following context, answer the student's question.\n\nContext:\n{context}\n\nQuestion:\n{question}"

# ==========================
# 🧠 Embedding Model
# ==========================
# Sentence-transformer used to embed both document chunks and incoming
# questions, so query/document vectors live in the same space.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# ==========================
# πŸ“„ PDF Text Extraction
# ==========================
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in the PDF at *pdf_path*, joined by newlines.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        A single string with one entry per page.
    """
    # Use the context manager so the document handle is closed even if
    # extraction raises (the original leaked the open document).
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)

def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: Input text; whitespace-split into words.
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared between consecutive chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        List of chunk strings; empty list for empty/whitespace-only input.

    Raises:
        ValueError: If chunk_size or overlap are out of range (the original
            silently produced a non-positive range step in that case).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap  # hoisted loop-invariant stride
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]

def create_vector_store(chunks):
    """Embed *chunks* and build a flat L2 FAISS index over the vectors.

    Args:
        chunks: List of text chunks to embed.

    Returns:
        Tuple ``(index, chunks, embeddings)`` — the FAISS index, the input
        chunks unchanged, and the raw embeddings from the encoder.
    """
    embeddings = embedder.encode(chunks)
    # FAISS expects float32, C-contiguous input; normalize explicitly instead
    # of relying on the encoder's output dtype.
    vectors = np.ascontiguousarray(np.asarray(embeddings, dtype="float32"))
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index, chunks, embeddings

def retrieve_relevant_chunks(question, index, chunks, embeddings, k=5):
    """Return the k chunks nearest to *question*, joined by blank lines.

    Note: *embeddings* is unused here; it is kept so the call signature
    matches what answer_question() passes.
    """
    query_vec = np.array(embedder.encode([question]))
    _distances, neighbor_ids = index.search(query_vec, k)
    selected = [chunks[idx] for idx in neighbor_ids[0]]
    return "\n\n".join(selected)

def call_llama3(system, user):
    """Send a system/user message pair to LLaMA 3 on Groq and return the reply text."""
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    completion = groq_client.chat.completions.create(
        messages=conversation,
        model="llama3-8b-8192",
    )
    return completion.choices[0].message.content

# ==========================
# 🌐 Gradio App
# ==========================

# Module-level state: written by process_pdf(), read by answer_question().
vector_index = None      # FAISS index over the current document (None until a PDF is processed)
stored_chunks = None     # list of chunk strings aligned with the index rows
stored_embeddings = None # embeddings from create_vector_store (passed on but unused downstream)

def process_pdf(file):
    """Extract, chunk, and index an uploaded PDF; return a user-facing status string.

    Populates the module-level vector_index / stored_chunks / stored_embeddings
    on success. All failures are reported as strings (never raised) so the
    Gradio status box always gets a message.
    """
    global vector_index, stored_chunks, stored_embeddings
    try:
        # Gradio may hand us a plain path string or a file-like object with .name.
        if isinstance(file, str):
            pdf_path = file
        elif hasattr(file, "name"):
            pdf_path = file.name
        else:
            return "❌ Error: Unsupported file format."

        raw_text = extract_text_from_pdf(pdf_path)
        if not raw_text.strip():
            return "❌ Error: No text found in the PDF. It might be image-based or encrypted."

        pieces = chunk_text(raw_text)
        if len(pieces) == 0:
            return "❌ Error: Could not generate chunks from text."

        vector_index, stored_chunks, stored_embeddings = create_vector_store(pieces)
        return f"✅ Successfully processed the document with {len(pieces)} chunks."

    except Exception as exc:
        # Deliberate broad catch: surface the full traceback in the UI status box.
        return f"❌ Failed to process PDF:\n{str(exc)}\n\n{traceback.format_exc()}"

def answer_question(question):
    """Answer *question* from the processed PDF via retrieval + LLaMA 3.

    Returns the model's answer, or a warning string if no PDF has been
    processed yet.
    """
    # Explicit None check: truthiness of a FAISS index object is not a
    # reliable "no document loaded" signal (the original used `not vector_index`).
    if vector_index is None:
        return "⚠️ Please upload and process a PDF first."

    context = retrieve_relevant_chunks(question, vector_index, stored_chunks, stored_embeddings)
    prompt = USER_TEMPLATE.format(context=context, question=question)
    return call_llama3(SYSTEM_TEMPLATE, prompt)

# Assemble the two-row Gradio UI: upload/process on top, Q&A below.
with gr.Blocks() as app:
    gr.Markdown("# 📚 RAG Paper Supervisor (LLaMA 3 via Groq)")
    gr.Markdown("Upload an academic PDF and ask questions — powered by LLaMA 3 and semantic search.")

    with gr.Row():
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        process_button = gr.Button("Process Document")
        status_box = gr.Textbox(label="Status", interactive=False)

    with gr.Row():
        question_box = gr.Textbox(label="Ask a question about the paper")
        answer_button = gr.Button("Get Answer")
        answer_box = gr.Textbox(label="Answer", lines=6)

    # Wire callbacks: processing fills the status box, asking fills the answer box.
    process_button.click(process_pdf, inputs=file_input, outputs=status_box)
    answer_button.click(answer_question, inputs=question_box, outputs=answer_box)

app.launch()