# RAG Paper Supervisor — Hugging Face Space (Gradio app)
import os
import traceback

import faiss
import fitz  # PyMuPDF
import gradio as gr
import groq
import numpy as np
from sentence_transformers import SentenceTransformer

# Read the Groq API key from the environment (set it as an HF Space secret).
# NOTE(security): never hard-code API keys in source. A key previously
# committed here is compromised and must be revoked/rotated with Groq.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
groq_client = groq.Groq(api_key=GROQ_API_KEY)
# ==========================
# Prompt Templates
# ==========================
# Fixed persona for the LLM (sent as the "system" message).
SYSTEM_TEMPLATE = "You are an expert academic supervisor helping students understand academic papers. Be concise, clear, and encouraging."
# User message template; filled with retrieved context and the student's question.
USER_TEMPLATE = "Based on the following context, answer the student's question.\n\nContext:\n{context}\n\nQuestion:\n{question}"

# ==========================
# Embedding Model
# ==========================
# Sentence-transformers MiniLM model, used to embed both document chunks and queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# ==========================
# PDF Text Extraction
# ==========================
def extract_text_from_pdf(pdf_path):
    """Extract all text from the PDF at *pdf_path*, pages joined by newlines.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        The concatenated text of every page, separated by newlines.
    """
    # Context manager ensures the document handle is closed even if
    # get_text() raises (the original leaked the open document).
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping word-window chunks.

    Args:
        text: Source text; split on whitespace.
        chunk_size: Number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        List of chunk strings; empty list for empty/whitespace-only text.

    Raises:
        ValueError: If overlap >= chunk_size — the window could never
            advance (the original silently returned [] for overlap >
            chunk_size because the range step went negative).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - overlap
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
def create_vector_store(chunks):
    """Embed *chunks* and build a FAISS L2 index over the vectors.

    Returns:
        Tuple (index, chunks, embeddings) so callers keep the chunk texts
        and raw vectors alongside the searchable index.
    """
    vectors = embedder.encode(chunks)
    dim = vectors.shape[1]
    store = faiss.IndexFlatL2(dim)
    store.add(np.array(vectors))
    return store, chunks, vectors
def retrieve_relevant_chunks(question, index, chunks, embeddings, k=5):
    """Return the k most relevant chunks for *question*, joined by blank lines.

    Args:
        question: The student's question string.
        index: FAISS index built over the chunk embeddings.
        chunks: List of text chunks aligned with the index entries.
        embeddings: Chunk embedding matrix (unused by the search itself;
            kept for interface compatibility with existing callers).
        k: Maximum number of chunks to retrieve.

    Returns:
        A single string of the retrieved chunks separated by blank lines.
    """
    # Never ask FAISS for more neighbours than exist: it pads missing
    # results with index -1, and chunks[-1] would silently wrap to the
    # last chunk in the original code.
    k = min(k, len(chunks))
    if k <= 0:
        return ""
    question_embedding = embedder.encode([question])
    distances, indices = index.search(np.array(question_embedding), k)
    return "\n\n".join(chunks[i] for i in indices[0] if i >= 0)
def call_llama3(system, user):
    """Send a system/user message pair to LLaMA 3 on Groq; return the reply text."""
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    response = groq_client.chat.completions.create(
        messages=messages,
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content
# ==========================
# Gradio App
# ==========================
# Module-level state shared between process_pdf() and answer_question();
# populated once a PDF has been processed.
vector_index = None       # FAISS index over chunk embeddings
stored_chunks = None      # list of chunk strings aligned with the index
stored_embeddings = None  # raw embedding matrix returned by the encoder
def process_pdf(file):
    """Extract, chunk, and index an uploaded PDF; return a status message.

    Populates the module-level vector_index / stored_chunks /
    stored_embeddings used later by answer_question().
    """
    global vector_index, stored_chunks, stored_embeddings

    def _resolve_path(obj):
        # Gradio may hand us a plain path string or a file-like wrapper
        # exposing .name; anything else is unsupported.
        if isinstance(obj, str):
            return obj
        if hasattr(obj, "name"):
            return obj.name
        return None

    try:
        path = _resolve_path(file)
        if path is None:
            return "β Error: Unsupported file format."

        text = extract_text_from_pdf(path)
        if not text.strip():
            return "β Error: No text found in the PDF. It might be image-based or encrypted."

        chunks = chunk_text(text)
        if len(chunks) == 0:
            return "β Error: Could not generate chunks from text."

        vector_index, stored_chunks, stored_embeddings = create_vector_store(chunks)
        return f"β Successfully processed the document with {len(chunks)} chunks."
    except Exception as e:
        # Best-effort UI handler: surface the full traceback to the user
        # instead of crashing the app.
        return f"β Failed to process PDF:\n{str(e)}\n\n{traceback.format_exc()}"
def answer_question(question):
    """Answer *question* from the processed PDF via retrieval + LLaMA 3.

    Returns a warning string if no PDF has been processed yet.
    """
    # Explicit None check: relying on truthiness of a FAISS (SWIG) index
    # object is fragile — `is None` states the actual intent.
    if vector_index is None:
        return "β οΈ Please upload and process a PDF first."
    context = retrieve_relevant_chunks(question, vector_index, stored_chunks, stored_embeddings)
    prompt = USER_TEMPLATE.format(context=context, question=question)
    return call_llama3(SYSTEM_TEMPLATE, prompt)
# Build the Gradio UI: an upload/process row, then a question/answer row.
with gr.Blocks() as app:
    gr.Markdown("# π RAG Paper Supervisor (LLaMA 3 via Groq)")
    gr.Markdown("Upload an academic PDF and ask questions β powered by LLaMA 3 and semantic search.")
    with gr.Row():
        pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_btn = gr.Button("Process Document")
    upload_output = gr.Textbox(label="Status", interactive=False)
    with gr.Row():
        question = gr.Textbox(label="Ask a question about the paper")
        ask_btn = gr.Button("Get Answer")
    answer = gr.Textbox(label="Answer", lines=6)
    # Wire the buttons to the handlers defined above.
    upload_btn.click(process_pdf, inputs=pdf_upload, outputs=upload_output)
    ask_btn.click(answer_question, inputs=question, outputs=answer)

app.launch()