File size: 4,902 Bytes
6a34abf d60bf93 6a34abf d60bf93 6a34abf d60bf93 6a34abf d60bf93 6a34abf d60bf93 6a34abf d60bf93 6a34abf d60bf93 6a34abf d60bf93 6a34abf 01d723b 6a34abf 01d723b d60bf93 6a34abf d60bf93 6a34abf d60bf93 01d723b d60bf93 6a34abf 01d723b d60bf93 01d723b 6a34abf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import gradio as gr
import PyPDF2
import re
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
# ----------------------------
# Embedding model
# ----------------------------
# Shared sentence-transformer used to embed both the marking-scheme chunks
# (vectorize_pdf) and the student answers (generate_enhanced_prompts).
embed_model = SentenceTransformer("all-mpnet-base-v2")
# ----------------------------
# In-memory storage
# ----------------------------
# Module-level stores populated by vectorize_pdf() and read by
# generate_enhanced_prompts(). Single-process, single-document state:
# re-vectorizing replaces all three.
vector_store = None  # FAISS IndexFlatL2 over the chunk embeddings
chunks_store = None  # list[str]: the raw text chunks, index-aligned with the FAISS ids
embeddings_store = None  # numpy array of chunk embeddings (kept for reference)
TOP_K = 3 # number of chunks to retrieve
# ----------------------------
# PDF Loader and Chunker
# ----------------------------
def load_pdf(file):
    """Return the extracted text of every page in *file* as a list.

    NOTE(review): PyPDF2's extract_text() can return None for image-only
    pages; callers are expected to filter falsy entries.
    """
    reader = PyPDF2.PdfReader(file)
    return [page.extract_text() for page in reader.pages]
def chunk_text(text_pages, chunk_size=200, overlap=50):
    """Split per-page text into overlapping word-based chunks.

    Args:
        text_pages: iterable of page strings; falsy entries (None / "")
            are skipped, matching what load_pdf() can produce.
        chunk_size: number of words per chunk.
        overlap: words shared between consecutive chunks; must be
            strictly smaller than chunk_size or the window never advances.

    Returns:
        list[str]: space-joined word chunks, in page order.

    Raises:
        ValueError: if overlap >= chunk_size (original code would loop
            forever in that case).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    step = chunk_size - overlap
    chunks = []
    for page in text_pages:
        if not page:
            continue
        # re.split(r'\s+') yields empty strings when the page starts or
        # ends with whitespace; drop them so chunks have no stray spaces.
        words = [w for w in re.split(r"\s+", page) if w]
        for start in range(0, len(words), step):
            chunks.append(" ".join(words[start:start + chunk_size]))
    return chunks
# ----------------------------
# Vectorization
# ----------------------------
def vectorize_pdf(marking_scheme_file):
    """Build a FAISS L2 index over word-chunks of the marking-scheme PDF.

    Side effects: replaces the module-level ``vector_store`` (FAISS
    IndexFlatL2), ``chunks_store`` (list[str]) and ``embeddings_store``
    (numpy array) globals.

    Returns:
        dict with ``num_chunks`` and a ``preview`` table of the first 10
        chunks (truncated text plus the first 5 embedding dimensions).
        For an empty/unextractable PDF, returns num_chunks == 0 with an
        ``error`` message instead of crashing.
    """
    global vector_store, chunks_store, embeddings_store
    pages = load_pdf(marking_scheme_file)
    chunks = chunk_text(pages)
    if not chunks:
        # Guard: encode([]) yields an empty array and embeddings.shape[1]
        # would raise IndexError for image-only / empty PDFs.
        return {
            "num_chunks": 0,
            "preview": [],
            "error": "No text could be extracted from the PDF."
        }
    embeddings = embed_model.encode(chunks, convert_to_numpy=True)
    vector_store = faiss.IndexFlatL2(embeddings.shape[1])
    vector_store.add(embeddings)
    chunks_store = chunks
    embeddings_store = embeddings
    # Small preview table for the UI (first 10 chunks only).
    table_preview = []
    for i, chunk in enumerate(chunks[:10]):
        table_preview.append({
            "chunk_id": i + 1,
            "text_preview": chunk[:50].replace("\n", " ") + ("..." if len(chunk) > 50 else ""),
            "embedding_preview": np.round(embeddings[i][:5], 4).tolist()
        })
    return {
        "num_chunks": len(chunks),
        "preview": table_preview
    }
# ----------------------------
# Parse student PDF (Question + Answer)
# ----------------------------
def parse_student_pdf_qna(student_pdf_file):
    """
    Parses a PDF where each answer is in format:
        Question: <text>
        Answer: <text>
    Returns a list of (question, answer) tuples, whitespace-stripped.
    """
    pages = load_pdf(student_pdf_file)
    # Fix: extract_text() may return None for image-only pages, and
    # "\n".join would raise TypeError on a None entry — skip falsy pages
    # (same guard chunk_text already applies).
    text = "\n".join(p for p in pages if p)
    # Lazily match each Question/Answer pair; the lookahead stops the
    # answer at the next "Question:" or end of document.
    pattern = re.compile(
        r"Question:\s*(.+?)\s*Answer:\s*(.+?)(?=Question:|$)",
        re.DOTALL | re.IGNORECASE,
    )
    return [(q.strip(), a.strip()) for q, a in pattern.findall(text)]
# ----------------------------
# Retrieve relevant chunks and generate enhanced prompt
# ----------------------------
def generate_enhanced_prompts(student_pdf_file, top_k=TOP_K, max_marks=4):
    """Build one grading prompt per (question, answer) pair in the PDF.

    Each student answer is embedded and used to retrieve the top_k nearest
    marking-scheme chunks from the FAISS index; those chunks become the
    "Marking Scheme Context" of an LLM grading prompt.

    Returns:
        dict mapping question text -> prompt string, or an error string
        when no marking scheme has been vectorized yet.
    """
    global vector_store, chunks_store, embeddings_store
    if vector_store is None or chunks_store is None:
        return "Error: No marking scheme vector store loaded. Please upload PDF first."
    prompts = {}
    for q, ans in parse_student_pdf_qna(student_pdf_file):
        # Embed the answer and look up the closest scheme chunks.
        query_vec = embed_model.encode([ans], convert_to_numpy=True)
        _, idx = vector_store.search(query_vec, top_k)
        context = " ".join(chunks_store[i] for i in idx[0])
        prompts[q] = f"""Instruction: You are a national exam marker. Compare the student's answer with the marking scheme and award marks according to the scheme. Provide rationale. Award partial marks if some points are covered. Output in JSON.
Question: {q}
Answer: {ans}
Marking Scheme Context: {context}
Maximum Marks: {max_marks}
Guidelines: If answer contains part of correct points, award partial marks proportionally.
Output Format:
{{
"score": <numeric>,
"rationale": "<explanation>"
}}
"""
    return prompts
# ----------------------------
# Gradio UI
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("## Vectorization + Retrieval + Enhanced Prompt Generation")

    # Step 1: upload and vectorize the marking scheme PDF.
    scheme_upload = gr.File(label="Upload Marking Scheme PDF")
    vector_info = gr.JSON(label="Vectorization Info")
    vectorize_btn = gr.Button("Vectorize PDF")
    vectorize_btn.click(vectorize_pdf, inputs=[scheme_upload], outputs=[vector_info])

    # Step 2: upload the student answers and generate grading prompts.
    answers_upload = gr.File(label="Upload Student Answer PDF")
    prompts_json = gr.JSON(label="Generated Prompts for Marking")
    prompts_btn = gr.Button("Generate Enhanced Prompts")
    prompts_btn.click(generate_enhanced_prompts, inputs=[answers_upload], outputs=[prompts_json])

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
|