# app.py — Hugging Face deployable RAG Study Supervisor
import os
import gradio as gr
import fitz # PyMuPDF
import docx
import numpy as np
import faiss
import requests
from sentence_transformers import SentenceTransformer
# Groq API key is read from the environment (set it as a secret in the
# Hugging Face Space settings); API calls will fail with 401 if unset.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# Groq-hosted Llama 3 8B chat model identifier.
MODEL = "llama3-8b-8192"
# Prompt templates: a fixed system persona plus a per-question user prompt
# that embeds the retrieved document context.
system_template = "You are a helpful and knowledgeable study supervisor. You are given excerpts from a document, and your job is to answer student questions based on that information. Be precise and explain clearly like a teacher."
user_template = """Context: {context}
Question: {question}
Answer like a teacher:"""
# Sentence-embedding model, shared by document chunks and queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
def extract_text(file):
    """Extract raw text from an uploaded PDF or Word document.

    Args:
        file: An object exposing a ``.name`` attribute that is a path on
            disk (Gradio's file wrapper provides this).

    Returns:
        The document's text with one line per page/paragraph, or "" for
        unsupported extensions.
    """
    ext = file.name.split(".")[-1].lower()
    if ext == "pdf":
        # Use the context manager so the PyMuPDF document (and its file
        # handle) is closed; the original leaked it.
        with fitz.open(file.name) as doc:
            return "\n".join(page.get_text() for page in doc)
    if ext in ("docx", "doc"):
        doc = docx.Document(file.name)
        return "\n".join(p.text for p in doc.paragraphs)
    return ""
def chunk_text(text, size=300, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: The document text to split.
        size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        A list of chunk strings; empty list for empty/whitespace text.

    Raises:
        ValueError: If ``overlap >= size`` (the stride would be <= 0 and
            ``range`` would raise a cryptic error otherwise).
    """
    if overlap >= size:
        raise ValueError("overlap must be smaller than size")
    words = text.split()
    step = size - overlap
    return [" ".join(words[i:i + size]) for i in range(0, len(words), step)]
def embed_chunks(chunks):
    """Encode text chunks into float32 vectors using the shared embedder."""
    raw_vectors = embedder.encode(chunks)
    return raw_vectors.astype("float32")
def store_faiss(chunks, vectors):
    """Build an exact L2 FAISS index over *vectors* and bundle it with the chunks.

    Returns a state dict with keys "documents", "vectors" and "index".
    """
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    state = {
        "documents": chunks,
        "vectors": vectors,
        "index": index,
    }
    return state
def get_context(query, state, k=3):
    """Retrieve the k document chunks most similar to *query*.

    Args:
        query: The user's question.
        state: The dict produced by ``store_faiss`` (or None before upload).
        k: Number of chunks to retrieve.

    Returns:
        ``(chunks, state)`` on success, or ``(error_message, None)`` when
        no document has been indexed yet (callers detect the error case by
        checking ``isinstance(result, str)``).
    """
    if state is None or "index" not in state:
        # NOTE: restored the mojibake'd warning emoji from the scraped source.
        return "⚠️ Please upload a document first.", None
    q_vec = embedder.encode([query]).astype("float32")
    # Clamp k: when the index holds fewer than k vectors, FAISS pads the
    # result with -1, and documents[-1] would silently duplicate the last
    # chunk. Also drop any -1 sentinels defensively.
    k = min(k, len(state["documents"]))
    _, I = state["index"].search(q_vec, k)
    return [state["documents"][i] for i in I[0] if i >= 0], state
def query_llm(context, question, timeout=60):
    """Ask the Groq chat-completions API to answer *question* given *context*.

    Args:
        context: List of retrieved document chunk strings.
        question: The user's question.
        timeout: Seconds to wait for the HTTP response (new parameter,
            defaults to 60 so existing callers are unaffected).

    Returns:
        The model's answer text, or an error string on a non-200 response.
    """
    prompt = user_template.format(context="\n".join(context), question=question)
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
        json={
            "model": MODEL,
            "messages": [
                {"role": "system", "content": system_template},
                {"role": "user", "content": prompt},
            ],
            "temperature": 0.3,
        },
        # Fix: the original request had no timeout and could hang forever.
        timeout=timeout,
    )
    if response.status_code == 200:
        return response.json()["choices"][0]["message"]["content"]
    # Fix: a non-JSON error body (e.g. an HTML gateway error) would make
    # response.json() raise; fall back to the raw text.
    try:
        detail = response.json()
    except ValueError:
        detail = response.text
    return f"β Error: {detail}"
def upload_file(file):
    """Process an uploaded document end to end: extract, chunk, embed, index.

    Args:
        file: Gradio file wrapper (object with a ``.name`` path attribute).

    Returns:
        ``(status_message, state)`` for the Gradio status box and session
        state; ``state`` is None when the file yielded no text.
    """
    text = extract_text(file)
    if not text.strip():
        return "⚠️ File is empty or unreadable.", None
    chunks = chunk_text(text)
    vectors = embed_chunks(chunks)
    state = store_faiss(chunks, vectors)
    # Fix: the scraped source had this literal split across two lines
    # (a SyntaxError); reconstructed as a single string.
    return "✅ Document processed!", state
def ask_question(question, state):
    """Answer *question* from the indexed document; returns (answer, state)."""
    if not question.strip():
        return "β Please enter a question.", state
    retrieved, state = get_context(question, state)
    # get_context signals "no document indexed yet" by returning a plain
    # string instead of a list of chunks.
    if isinstance(retrieved, str):
        return retrieved, state
    answer = query_llm(retrieved, question)
    return answer, state
# Gradio interface: wires the upload and question handlers to the widgets.
with gr.Blocks() as demo:
    gr.Markdown("# π Study Supervisor Chatbot\nUpload a document and ask questions about it.")
    # Per-session retrieval state (chunks + FAISS index) from upload_file.
    state = gr.State()
    with gr.Row():
        file = gr.File(file_types=[".pdf", ".docx", ".doc"], label="π Upload Document")
        upload_btn = gr.Button("π€ Upload and Process")
    # NOTE(review): status/question/answer assumed to sit below the Row —
    # original indentation was lost in extraction; confirm intended layout.
    status = gr.Textbox(label="Status", interactive=False)
    question = gr.Textbox(label="β Ask a Question")
    answer = gr.Textbox(label="π Answer", lines=8)
    # Clicking the button indexes the document; pressing Enter in the
    # question box runs retrieval + LLM answering.
    upload_btn.click(upload_file, inputs=file, outputs=[status, state])
    question.submit(ask_question, inputs=[question, state], outputs=[answer, state])
demo.launch()