Spaces: Runtime error

Upload 2 files

- app.py (+75 -125)
- requirements.txt (+4 -5)
app.py CHANGED
@@ -1,142 +1,92 @@
+
 import os
 import fitz  # PyMuPDF
-import numpy as np
 import faiss
+import numpy as np
 import pickle
-import torch
+import torch
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
+import gradio as gr
 
-# …
-…
-os.makedirs(INDEX_DIR, exist_ok=True)
-
-# === Load sentence transformer and LLM pipeline ===
-model = SentenceTransformer(MODEL_NAME)
-llm = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2", tokenizer="mistralai/Mistral-7B-Instruct-v0.2", device=0 if torch.cuda.is_available() else -1)
-
-# === Helper functions ===
-def load_pdf_by_sections(file_path):
-    doc = fitz.open(file_path)
-    chunks = []
-    for page in doc:
-        blocks = page.get_text("blocks")
-        blocks.sort()  # sort top-to-bottom
-        text = ""
-        for b in blocks:
-            content = b[4].strip()
-            if len(content.split()) < 5:
-                continue
-            if content.isupper() or content.istitle():  # crude heading detection
-                if text:
-                    chunks.append(text.strip())
-                text = content
-            else:
-                text += " " + content
-        if text:
-            chunks.append(text.strip())
-    return chunks
-
-def create_index(chunks):
-    embeddings = model.encode(chunks)
-    index = faiss.IndexFlatL2(embeddings.shape[1])
-    index.add(np.array(embeddings))
-    return index, embeddings
-
-def save_index(index, embeddings, chunks):
-    faiss.write_index(index, os.path.join(INDEX_DIR, "faiss.index"))
-    with open(os.path.join(INDEX_DIR, "chunks.pkl"), "wb") as f:
-        pickle.dump(chunks, f)
-    with open(os.path.join(INDEX_DIR, "embeddings.npy"), "wb") as f:
-        np.save(f, embeddings)
-
-def load_index():
-    index_path = os.path.join(INDEX_DIR, "faiss.index")
-    if os.path.exists(index_path):
-        index = faiss.read_index(index_path)
-        with open(os.path.join(INDEX_DIR, "chunks.pkl"), "rb") as f:
-            chunks = pickle.load(f)
-        embeddings = np.load(os.path.join(INDEX_DIR, "embeddings.npy"))
-        return index, embeddings, chunks
-    else:
-        return None, None, []
+# Load or create FAISS index and associated data
+INDEX_FILE = "faiss_index.bin"
+CHUNKS_FILE = "chunks.pkl"
+
+model = SentenceTransformer("all-MiniLM-L6-v2")
+llm = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2", tokenizer="mistralai/Mistral-7B-Instruct-v0.2", device=-1)
 
-def ask_question(…):
-    …
-    return …
+def load_pdf(file):
+    doc = fitz.open(file)
+    text = "\n".join(page.get_text() for page in doc)
+    return text
 
-def quiz_me(…):
+def split_text(text, chunk_size=500):
+    words = text.split()
+    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
+
+def create_or_load_index(chunks):
+    if os.path.exists(INDEX_FILE) and os.path.exists(CHUNKS_FILE):
+        with open(CHUNKS_FILE, "rb") as f:
+            chunks = pickle.load(f)
+        index = faiss.read_index(INDEX_FILE)
+    else:
+        embeddings = model.encode(chunks)
+        index = faiss.IndexFlatL2(embeddings.shape[1])
+        index.add(np.array(embeddings))
+        faiss.write_index(index, INDEX_FILE)
+        with open(CHUNKS_FILE, "wb") as f:
+            pickle.dump(chunks, f)
+    return index, chunks
+
+def retrieve_context(query, index, chunks, top_k=3):
     query_emb = model.encode([query])
    distances, indices = index.search(np.array(query_emb), top_k)
-    …
-    for i in sample_indices:
-        context = chunks[i]
-        prompt = f"Based only on the text below, generate a single clear quiz question:\n\n{context}\n\nQuestion:"
-        result = llm(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)[0]["generated_text"]
-        extracted = result.split("Question:")[-1].strip()
-        questions.append(extracted)
-    return "\n\n".join(questions)
-
-# === Gradio Interface ===
-state = {"index": None, "embeddings": None, "chunks": []}
-
-def process_pdfs(files):
-    all_chunks = []
-    for file in files:
-        chunks = load_pdf_by_sections(file.name)
-        all_chunks.extend(chunks)
-    if all_chunks:
-        index, embeddings = create_index(all_chunks)
-        save_index(index, embeddings, all_chunks)
-        state.update({"index": index, "embeddings": embeddings, "chunks": all_chunks})
-        return f"Processed {len(files)} file(s) successfully."
-    else:
-        return "No usable text found in PDFs."
-
-def query_bot(query):
-    if not state["index"]:
-        index, embeddings, chunks = load_index()
-        if index:
-            state.update({"index": index, "embeddings": embeddings, "chunks": chunks})
-        else:
-            return "Please upload and process PDFs first."
-    return ask_question(query, state["index"], state["embeddings"], state["chunks"])
-
-def quiz_bot(dummy=""):
-    if not state["index"]:
-        index, embeddings, chunks = load_index()
-        if index:
-            state.update({"index": index, "embeddings": embeddings, "chunks": chunks})
-        else:
-            return "Please upload and process PDFs first."
-    return quiz_me(state["index"], state["embeddings"], state["chunks"])
+    return "\n\n".join([chunks[i] for i in indices[0]])
+
+def answer_question(query, index, chunks):
+    context = retrieve_context(query, index, chunks)
+    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
+    response = llm(prompt, max_new_tokens=256, do_sample=False)
+    return response[0]["generated_text"].split("Answer:")[-1].strip()
 
+def generate_quiz(index, chunks):
+    context = retrieve_context("generate quiz questions", index, chunks)
+    prompt = f"Based on the following context, generate 3 quiz questions with multiple choice answers:\n\n{context}\n\nQuestions:"
+    response = llm(prompt, max_new_tokens=512, do_sample=False)
+    return response[0]["generated_text"].split("Questions:")[-1].strip()
+
+# Gradio UI
 with gr.Blocks() as demo:
-    …
+    state = {"index": None, "chunks": []}
+
+    gr.Markdown("# 📘 AI Revision Assistant")
+
     with gr.Row():
-        file_input = gr.File(…
-        …
+        file_input = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload your revision PDFs")
+        status_output = gr.Textbox(label="Status", interactive=False)
 
-    gr.…
-    …
-    answer = gr.Textbox(label="Answer")
-    question.submit(query_bot, inputs=question, outputs=answer)
+    def process(files):
+        all_chunks = []
+        for file in files:
+            text = load_pdf(file.name)
+            chunks = split_text(text)
+            all_chunks.extend(chunks)
+        index, chunks = create_or_load_index(all_chunks)
+        state["index"] = index
+        state["chunks"] = chunks
+        return f"Processed {len(files)} files. You can now ask questions or generate quizzes."
+
+    file_input.change(fn=process, inputs=file_input, outputs=status_output)
+
+    question_input = gr.Textbox(label="Ask a revision question")
+    answer_output = gr.Textbox(label="Answer", lines=5)
+
+    question_input.submit(fn=lambda q: answer_question(q, state["index"], state["chunks"]) if state["index"] else "Please upload files first.", inputs=question_input, outputs=answer_output)
 
-    quiz_btn = gr.Button("Generate 2 Quiz Questions")
-    quiz_output = gr.Textbox(label="Quiz Questions")
-    quiz_btn.click(quiz_bot, inputs=[], outputs=quiz_output)
+    quiz_btn = gr.Button("Quiz Me")
+    quiz_output = gr.Textbox(label="Generated Quiz Questions", lines=6)
 
-demo.launch()
+    quiz_btn.click(fn=lambda: generate_quiz(state["index"], state["chunks"]) if state["index"] else "Please upload files first.", outputs=quiz_output)
+
+demo.launch(debug=True)
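Taken together, the new app.py reduces the pipeline to: extract page text with PyMuPDF, split it into 500-word chunks, embed the chunks with all-MiniLM-L6-v2, store them in an exact L2 FAISS index, and put the top-k hits into the Mistral prompt. The following standalone sketch exercises just the retrieval half with the same library calls and model name as the diff; the sample chunks and query are invented for illustration, and the Mistral generation step is skipped so it runs quickly on CPU.

    import faiss
    import numpy as np
    from sentence_transformers import SentenceTransformer

    # Same embedding model as app.py; downloaded on first use.
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Invented stand-ins for the chunks that split_text() would produce.
    chunks = [
        "Photosynthesis converts light energy into chemical energy in chloroplasts.",
        "Aerobic respiration in mitochondria releases energy from glucose.",
        "Osmosis is the diffusion of water across a partially permeable membrane.",
    ]

    # Mirrors create_or_load_index(): embed, then index with exact L2 search.
    embeddings = model.encode(chunks)            # shape (3, 384) for MiniLM
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings))

    # Mirrors retrieve_context(): embed the query, take the top_k nearest chunks.
    query_emb = model.encode(["How do cells release energy?"])
    distances, indices = index.search(np.array(query_emb), 2)
    print("\n\n".join(chunks[i] for i in indices[0]))

In the full app the joined chunks become the `Context:` block of the prompt passed to the text-generation pipeline. One design consequence of create_or_load_index() worth noting: when faiss_index.bin and chunks.pkl already exist, it loads them and ignores the chunks argument, so a previously built index takes precedence over newly uploaded PDFs.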
requirements.txt CHANGED
@@ -1,8 +1,7 @@
-…
+
+torch
+transformers
 sentence-transformers
 faiss-cpu
+PyMuPDF
 gradio
-transformers
-torch
-accelerate
-safetensors
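The reworked requirements.txt now lists PyMuPDF explicitly alongside the torch/transformers stack and drops accelerate and safetensors. A quick post-install sanity check, assuming `pip install -r requirements.txt` has completed, might look like this; the toy index and version prints are illustrative only.

    # Check that every dependency imports and FAISS accepts vectors end to end.
    import faiss
    import fitz  # PyMuPDF
    import gradio
    import numpy as np
    import sentence_transformers
    import torch
    import transformers

    index = faiss.IndexFlatL2(4)
    index.add(np.zeros((2, 4), dtype="float32"))
    assert index.ntotal == 2
    print(torch.__version__, transformers.__version__, gradio.__version__)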