Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import os | |
| import faiss | |
| import numpy as np | |
| from PyPDF2 import PdfReader | |
| from sentence_transformers import SentenceTransformer | |
| from groq import Groq | |
# The Groq API key is read from the environment (the error text below tells
# the operator to configure it as a Space secret); start-up aborts without it.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise ValueError(
        "Please set the GROQ_API_KEY as a secret/environment variable in the Hugging Face Space."
    )
groq_client = Groq(api_key=GROQ_API_KEY)

# Sentence-embedding model shared by document indexing and question lookup.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Prompt templates.
# SYSTEM_TEMPLATE is sent as the system message of every chat completion.
SYSTEM_TEMPLATE = (
    "You are a helpful academic supervisor who helps a student understand "
    "their paper and answer questions based only on the document."
)
# USER_TEMPLATE is filled via str.format with the retrieved ``context`` and
# the user's ``question``.
USER_TEMPLATE = (
    "Based on the provided document, answer the following question:\n\n"
    "{context}\n\n"
    "Question: {question}\n\n"
    "Answer:"
)

# Module-level state shared between the upload and question handlers.
faiss_index = None  # vector index; None until a document has been indexed
stored_chunks = []  # chunk texts, in the same order as the vectors in the index
dimension = 384  # embedding size
# PDF to text
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, or ``None``.

    Each page that yields text contributes that text followed by a newline;
    pages for which extraction returns nothing are skipped. ``None`` is
    returned when the combined result is empty or whitespace-only.
    """
    pages = PdfReader(pdf_path).pages
    extracted = (page.extract_text() for page in pages)
    combined = "".join(page_text + "\n" for page_text in extracted if page_text)
    if not combined.strip():
        return None
    return combined
# Chunk text
def chunk_text(text, chunk_size=500):
    """Split *text* into sentence-aligned chunks of roughly *chunk_size* chars.

    The text is split on ``". "`` and the pieces are greedily packed: a piece
    joins the current chunk while ``len(current) + len(piece) <= chunk_size``,
    otherwise the current chunk is closed and the piece starts a new one. A
    single piece longer than *chunk_size* becomes a chunk of its own.

    Args:
        text: The text to split. ``None``, empty, or whitespace-only text
            yields an empty list.
        chunk_size: Soft upper bound on chunk length, in characters.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings.
    """
    if not text or not text.strip():
        return []

    pieces = text.split(". ")
    final = len(pieces) - 1
    chunks = []
    current = ""
    for position, sentence in enumerate(pieces):
        # split() removed the ". " separator from every piece except the
        # last, so only restore it where it was actually taken away;
        # otherwise text ending in "." would gain a second period.
        restored = sentence if position == final else sentence + ". "
        # Only close the current chunk when there is one: an oversized first
        # sentence must not push an empty chunk into the result.
        if current and len(current) + len(sentence) > chunk_size:
            chunks.append(current.strip())
            current = restored
        else:
            current += restored
    if current.strip():
        chunks.append(current.strip())
    return chunks
# Upload & index
def upload_and_index(pdf):
    """Extract, chunk, embed and index an uploaded PDF for later retrieval.

    Args:
        pdf: The value delivered by the upload component: either a filesystem
            path string or an object whose ``name`` attribute is the path.
            ``None`` means nothing was uploaded.

    Returns:
        A status message for display. Failures are reported in the returned
        string rather than raised.
    """
    global faiss_index, stored_chunks
    if pdf is None:
        return "β No PDF uploaded. Please upload a file."
    try:
        # Depending on the Gradio version/configuration the handler receives
        # a plain path string or a file wrapper exposing the path as ``.name``.
        pdf_path = pdf if isinstance(pdf, str) else pdf.name
        text = extract_text_from_pdf(pdf_path)
        if text is None:
            return "β Failed to extract text. PDF may be scanned or empty."
        chunks = chunk_text(text)
        if not chunks:
            return "β Failed to split text into chunks."
        vectors = np.array(embedder.encode(chunks), dtype=np.float32)
        # Size the index from the embeddings themselves so it always matches
        # the width the model actually produced.
        new_index = faiss.IndexFlatL2(vectors.shape[1])
        new_index.add(vectors)
    except Exception as e:
        return f"β Error during upload/indexing: {e}"
    # Publish the new state only after everything succeeded, so a failed
    # upload leaves the previously indexed document fully usable.
    faiss_index = new_index
    stored_chunks[:] = chunks
    return "β Document indexed successfully! You can now ask questions."
# Answer question
def answer_question(question):
    """Answer *question* using the most similar chunks of the indexed document.

    The question is embedded, up to three nearest chunks are retrieved from
    the FAISS index, and they are sent together with the question to the
    Groq chat-completion endpoint.

    Args:
        question: The user's question text.

    Returns:
        The model's answer, or a status message when no document is indexed,
        no context is found, or an error occurs (errors are reported in the
        returned string rather than raised).
    """
    if faiss_index is None or faiss_index.ntotal == 0:
        return "β Please upload & index a document first."
    try:
        q_embedding = embedder.encode([question])
        # Never request more neighbours than are indexed: FAISS pads missing
        # results with -1, which as a list index would silently select the
        # last chunk.
        top_k = min(3, faiss_index.ntotal)
        _, indices = faiss_index.search(
            np.array(q_embedding, dtype=np.float32), k=top_k
        )
        # Bounds-check from below as well, so a -1 placeholder can never be
        # used as a (negative) list index.
        context = "\n\n".join(
            stored_chunks[int(idx)]
            for idx in indices[0]
            if 0 <= idx < len(stored_chunks)
        )
        if not context:
            return "β Could not find relevant context."
        prompt = USER_TEMPLATE.format(context=context, question=question)
        chat_completion = groq_client.chat.completions.create(
            # NOTE(review): confirm this model id is still served by Groq;
            # hosted model ids get retired over time.
            model="llama3-8b-8192",
            messages=[
                {"role": "system", "content": SYSTEM_TEMPLATE},
                {"role": "user", "content": prompt},
            ],
            temperature=0.2,
            max_tokens=512,
        )
        return chat_completion.choices[0].message.content.strip()
    except Exception as e:
        return f"β Error while answering: {e}"
# Gradio Interface
# One row holding two columns: upload/indexing controls in the first,
# question/answer controls in the second.
with gr.Blocks() as demo:
    gr.Markdown("## π Academic Supervisor β Upload Paper & Ask Questions")
    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            upload_btn = gr.Button("Upload & Index")
            upload_output = gr.Textbox(label="Status")
        with gr.Column():
            question_input = gr.Textbox(label="Your Question")
            ask_btn = gr.Button("Get Answer")
            answer_output = gr.Textbox(label="Answer")

    # Wire each button to its handler; the handler's returned string is shown
    # in the corresponding output textbox.
    upload_btn.click(
        fn=upload_and_index,
        inputs=pdf_input,
        outputs=upload_output,
    )
    ask_btn.click(
        fn=answer_question,
        inputs=question_input,
        outputs=answer_output,
    )

demo.launch()