traleela committed on
Commit
473affa
·
verified ·
1 Parent(s): fe0e877

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +75 -125
  2. requirements.txt +4 -5
app.py CHANGED
@@ -1,142 +1,92 @@
 
1
  import os
2
  import fitz # PyMuPDF
3
- import numpy as np
4
  import faiss
 
5
  import pickle
6
- import gradio as gr
7
  from sentence_transformers import SentenceTransformer
8
  from transformers import pipeline
 
9
 
10
- # === Configuration ===
11
- INDEX_DIR = "saved_index"
12
- MODEL_NAME = "all-MiniLM-L6-v2"
13
- CHUNK_SIZE = 500
14
-
15
- # === Ensure save directory exists ===
16
- os.makedirs(INDEX_DIR, exist_ok=True)
17
-
18
- # === Load sentence transformer and LLM pipeline ===
19
- model = SentenceTransformer(MODEL_NAME)
20
- llm = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2", tokenizer="mistralai/Mistral-7B-Instruct-v0.2", device=0 if torch.cuda.is_available() else -1)
21
-
22
- # === Helper functions ===
23
- def load_pdf_by_sections(file_path):
24
- doc = fitz.open(file_path)
25
- chunks = []
26
- for page in doc:
27
- blocks = page.get_text("blocks")
28
- blocks.sort() # sort top-to-bottom
29
- text = ""
30
- for b in blocks:
31
- content = b[4].strip()
32
- if len(content.split()) < 5:
33
- continue
34
- if content.isupper() or content.istitle(): # crude heading detection
35
- if text:
36
- chunks.append(text.strip())
37
- text = content
38
- else:
39
- text += " " + content
40
- if text:
41
- chunks.append(text.strip())
42
- return chunks
43
-
44
- def create_index(chunks):
45
- embeddings = model.encode(chunks)
46
- index = faiss.IndexFlatL2(embeddings.shape[1])
47
- index.add(np.array(embeddings))
48
- return index, embeddings
49
-
50
- def save_index(index, embeddings, chunks):
51
- faiss.write_index(index, os.path.join(INDEX_DIR, "faiss.index"))
52
- with open(os.path.join(INDEX_DIR, "chunks.pkl"), "wb") as f:
53
- pickle.dump(chunks, f)
54
- with open(os.path.join(INDEX_DIR, "embeddings.npy"), "wb") as f:
55
- np.save(f, embeddings)
56
-
57
- def load_index():
58
- index_path = os.path.join(INDEX_DIR, "faiss.index")
59
- if os.path.exists(index_path):
60
- index = faiss.read_index(index_path)
61
- with open(os.path.join(INDEX_DIR, "chunks.pkl"), "rb") as f:
62
- chunks = pickle.load(f)
63
- embeddings = np.load(os.path.join(INDEX_DIR, "embeddings.npy"))
64
- return index, embeddings, chunks
65
- else:
66
- return None, None, []
67
 
68
- def generate_answer(context, question):
69
- prompt = f"Answer this based only on the context below. Be precise and relevant.\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer:"
70
- response = llm(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)[0]["generated_text"]
71
- return response.split("Answer:")[-1].strip()
72
 
73
- def ask_question(query, index, embeddings, chunks, top_k=3):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  query_emb = model.encode([query])
75
  distances, indices = index.search(np.array(query_emb), top_k)
76
- relevant_context = "\n\n".join([chunks[i] for i in indices[0]])
77
- return generate_answer(relevant_context, query)
78
-
79
- def quiz_me(index, embeddings, chunks):
80
- np.random.seed(42)
81
- sample_indices = np.random.choice(len(chunks), 2, replace=False)
82
- questions = []
83
- for i in sample_indices:
84
- context = chunks[i]
85
- prompt = f"Based only on the text below, generate a single clear quiz question:\n\n{context}\n\nQuestion:"
86
- result = llm(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)[0]["generated_text"]
87
- extracted = result.split("Question:")[-1].strip()
88
- questions.append(extracted)
89
- return "\n\n".join(questions)
90
-
91
- # === Gradio Interface ===
92
- state = {"index": None, "embeddings": None, "chunks": []}
93
-
94
- def process_pdfs(files):
95
- all_chunks = []
96
- for file in files:
97
- chunks = load_pdf_by_sections(file.name)
98
- all_chunks.extend(chunks)
99
- if all_chunks:
100
- index, embeddings = create_index(all_chunks)
101
- save_index(index, embeddings, all_chunks)
102
- state.update({"index": index, "embeddings": embeddings, "chunks": all_chunks})
103
- return f"Processed {len(files)} file(s) successfully."
104
- else:
105
- return "No usable text found in PDFs."
106
-
107
- def query_bot(query):
108
- if not state["index"]:
109
- index, embeddings, chunks = load_index()
110
- if index:
111
- state.update({"index": index, "embeddings": embeddings, "chunks": chunks})
112
- else:
113
- return "Please upload and process PDFs first."
114
- return ask_question(query, state["index"], state["embeddings"], state["chunks"])
115
-
116
- def quiz_bot(dummy=""):
117
- if not state["index"]:
118
- index, embeddings, chunks = load_index()
119
- if index:
120
- state.update({"index": index, "embeddings": embeddings, "chunks": chunks})
121
- else:
122
- return "Please upload and process PDFs first."
123
- return quiz_me(state["index"], state["embeddings"], state["chunks"])
124
 
 
 
 
 
 
 
 
125
  with gr.Blocks() as demo:
126
- gr.Markdown("# 🎓 AI Study Assistant (PDF-based with Smart Q&A + Quizzing)")
 
 
 
127
  with gr.Row():
128
- file_input = gr.File(label="Upload PDFs", file_count="multiple")
129
- status = gr.Textbox(label="Status Message")
130
- file_input.change(process_pdfs, inputs=file_input, outputs=status)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
- gr.Markdown("## ❓ Ask a Question from Your Material")
133
- question = gr.Textbox(label="Question")
134
- answer = gr.Textbox(label="Answer")
135
- question.submit(query_bot, inputs=question, outputs=answer)
136
 
137
- gr.Markdown("## 🧠 Quiz Me from Uploaded PDFs")
138
- quiz_btn = gr.Button("Generate 2 Quiz Questions")
139
- quiz_output = gr.Textbox(label="Quiz Questions")
140
- quiz_btn.click(quiz_bot, inputs=[], outputs=quiz_output)
141
 
142
- demo.launch()
 
1
+
2
  import os
3
  import fitz # PyMuPDF
 
4
  import faiss
5
+ import numpy as np
6
  import pickle
7
+ import torch
8
  from sentence_transformers import SentenceTransformer
9
  from transformers import pipeline
10
+ import gradio as gr
11
 
12
+ # Load or create FAISS index and associated data
13
+ INDEX_FILE = "faiss_index.bin"
14
+ CHUNKS_FILE = "chunks.pkl"
15
+
16
+ model = SentenceTransformer("all-MiniLM-L6-v2")
17
+ llm = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2", tokenizer="mistralai/Mistral-7B-Instruct-v0.2", device=-1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
+ def load_pdf(file):
20
+ doc = fitz.open(file)
21
+ text = "\n".join(page.get_text() for page in doc)
22
+ return text
23
 
24
+ def split_text(text, chunk_size=500):
25
+ words = text.split()
26
+ return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
27
+
28
+ def create_or_load_index(chunks):
29
+ if os.path.exists(INDEX_FILE) and os.path.exists(CHUNKS_FILE):
30
+ with open(CHUNKS_FILE, "rb") as f:
31
+ chunks = pickle.load(f)
32
+ index = faiss.read_index(INDEX_FILE)
33
+ else:
34
+ embeddings = model.encode(chunks)
35
+ index = faiss.IndexFlatL2(embeddings.shape[1])
36
+ index.add(np.array(embeddings))
37
+ faiss.write_index(index, INDEX_FILE)
38
+ with open(CHUNKS_FILE, "wb") as f:
39
+ pickle.dump(chunks, f)
40
+ return index, chunks
41
+
42
+ def retrieve_context(query, index, chunks, top_k=3):
43
  query_emb = model.encode([query])
44
  distances, indices = index.search(np.array(query_emb), top_k)
45
+ return "\n\n".join([chunks[i] for i in indices[0]])
46
+
47
+ def answer_question(query, index, chunks):
48
+ context = retrieve_context(query, index, chunks)
49
+ prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
50
+ response = llm(prompt, max_new_tokens=256, do_sample=False)
51
+ return response[0]["generated_text"].split("Answer:")[-1].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
+ def generate_quiz(index, chunks):
54
+ context = retrieve_context("generate quiz questions", index, chunks)
55
+ prompt = f"Based on the following context, generate 3 quiz questions with multiple choice answers:\n\n{context}\n\nQuestions:"
56
+ response = llm(prompt, max_new_tokens=512, do_sample=False)
57
+ return response[0]["generated_text"].split("Questions:")[-1].strip()
58
+
59
+ # Gradio UI
60
  with gr.Blocks() as demo:
61
+ state = {"index": None, "chunks": []}
62
+
63
+ gr.Markdown("# 📘 AI Revision Assistant")
64
+
65
  with gr.Row():
66
+ file_input = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload your revision PDFs")
67
+ status_output = gr.Textbox(label="Status", interactive=False)
68
+
69
+ def process(files):
70
+ all_chunks = []
71
+ for file in files:
72
+ text = load_pdf(file.name)
73
+ chunks = split_text(text)
74
+ all_chunks.extend(chunks)
75
+ index, chunks = create_or_load_index(all_chunks)
76
+ state["index"] = index
77
+ state["chunks"] = chunks
78
+ return f"Processed {len(files)} files. You can now ask questions or generate quizzes."
79
+
80
+ file_input.change(fn=process, inputs=file_input, outputs=status_output)
81
+
82
+ question_input = gr.Textbox(label="Ask a revision question")
83
+ answer_output = gr.Textbox(label="Answer", lines=5)
84
+
85
+ question_input.submit(fn=lambda q: answer_question(q, state["index"], state["chunks"]) if state["index"] else "Please upload files first.", inputs=question_input, outputs=answer_output)
86
 
87
+ quiz_btn = gr.Button("Quiz Me")
88
+ quiz_output = gr.Textbox(label="Generated Quiz Questions", lines=6)
 
 
89
 
90
+ quiz_btn.click(fn=lambda: generate_quiz(state["index"], state["chunks"]) if state["index"] else "Please upload files first.", outputs=quiz_output)
 
 
 
91
 
92
+ demo.launch(debug=True)
requirements.txt CHANGED
@@ -1,8 +1,7 @@
1
- pymupdf
 
 
2
  sentence-transformers
3
  faiss-cpu
 
4
  gradio
5
- transformers
6
- torch
7
- accelerate
8
- safetensors
 
1
+
2
+ torch
3
+ transformers
4
  sentence-transformers
5
  faiss-cpu
6
+ PyMuPDF
7
  gradio