import re

import faiss
import gradio as gr
import numpy as np
import PyPDF2
from sentence_transformers import SentenceTransformer
# Sentence-embedding model shared by indexing and querying. It is created
# once at import time so every request reuses the same instance.
embed_model = SentenceTransformer("all-mpnet-base-v2")

# Module-level state written by vectorize_pdf() and read by
# generate_enhanced_prompts(). All three stay None until a marking scheme
# has been vectorized.
vector_store = None  # FAISS index built over the chunk embeddings
chunks_store = None  # chunk texts, in the same order as the index rows
embeddings_store = None  # embedding matrix the index was built from

# Default number of marking-scheme chunks retrieved per student answer.
TOP_K = 3
def load_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Passed straight to ``PyPDF2.PdfReader``.

    Returns:
        A list with one string per page, in page order. A page for which
        PyPDF2 yields nothing is represented by ``""`` so callers can join
        or split the result without checking for ``None``.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # Normalise any falsy extraction result (e.g. None) to an empty string.
    return [page.extract_text() or "" for page in pdf_reader.pages]
def chunk_text(text_pages, chunk_size=200, overlap=50):
    """Split page texts into overlapping windows of whitespace-separated words.

    Each page is chunked independently; a chunk never spans two pages.

    Args:
        text_pages: Iterable of page strings. Falsy entries (``None``, ``""``)
            and whitespace-only pages contribute no chunks.
        chunk_size: Maximum number of words per chunk. Must be >= 1.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, each made of words joined by single spaces.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap >= chunk_size would otherwise never advance the window.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    for page in text_pages:
        if not page:
            continue
        # str.split() drops leading/trailing whitespace, so no empty tokens
        # end up inside (or as) a chunk.
        words = page.split()
        for start in range(0, len(words), step):
            end = start + chunk_size
            chunks.append(" ".join(words[start:end]))
            # Once a window reaches the end of the page, any further window
            # would only repeat words already covered by this one.
            if end >= len(words):
                break
    return chunks
def vectorize_pdf(marking_scheme_file):
    """Chunk and embed a marking-scheme PDF and index it with FAISS.

    On success the module-level ``vector_store``, ``chunks_store`` and
    ``embeddings_store`` are replaced together. If the PDF yields no text
    chunks, all three are reset to ``None`` so that a previously loaded
    marking scheme is not used by mistake.

    Args:
        marking_scheme_file: The uploaded PDF, passed to ``load_pdf``.

    Returns:
        A dict with ``"num_chunks"`` (total chunk count) and ``"preview"``
        (up to 10 entries holding ``chunk_id``, a 50-character
        ``text_preview`` and the first five embedding values rounded to four
        decimals). When no text could be chunked, an ``"error"`` message is
        included as well.
    """
    global vector_store, chunks_store, embeddings_store

    pages = load_pdf(marking_scheme_file)
    chunks = chunk_text(pages)

    if not chunks:
        # Without at least one embedding there is no vector dimension to
        # size the index with, so stop before touching the model.
        vector_store = chunks_store = embeddings_store = None
        return {
            "num_chunks": 0,
            "preview": [],
            "error": "No extractable text found in the marking scheme PDF.",
        }

    embeddings = embed_model.encode(chunks, convert_to_numpy=True)
    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    # Build the index locally and publish the globals only once it is
    # complete, so a failure cannot leave them out of sync.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    vector_store, chunks_store, embeddings_store = index, chunks, embeddings

    table_preview = [
        {
            "chunk_id": i + 1,
            "text_preview": chunk[:50].replace("\n", " ")
            + ("..." if len(chunk) > 50 else ""),
            "embedding_preview": np.round(embeddings[i][:5], 4).tolist(),
        }
        for i, chunk in enumerate(chunks[:10])
    ]

    return {
        "num_chunks": len(chunks),
        "preview": table_preview,
    }
def parse_student_pdf_qna(student_pdf_file):
    """Parse a student PDF into (question, answer) pairs.

    The PDF text is expected to contain entries in the format::

        Question: <text>
        Answer: <text>

    Matching is case-insensitive and entries may span lines and pages.

    Args:
        student_pdf_file: The uploaded PDF, passed to ``load_pdf``.

    Returns:
        A list of ``(question, answer)`` tuples with surrounding whitespace
        stripped, in document order. A blank answer is returned as ``""``.
    """
    pages = load_pdf(student_pdf_file)
    # Guard against pages with no extracted text (None) before joining.
    text = "\n".join(page or "" for page in pages)

    # The answer group is ``.*?`` rather than ``.+?`` so that a blank answer
    # stops at the next "Question:" instead of swallowing the following
    # question/answer pair.
    pattern = re.compile(
        r"Question:\s*(.+?)\s*Answer:\s*(.*?)(?=Question:|$)",
        re.DOTALL | re.IGNORECASE,
    )
    return [(q.strip(), a.strip()) for q, a in pattern.findall(text)]
def generate_enhanced_prompts(student_pdf_file, top_k=TOP_K, max_marks=4):
    """Build one marking prompt per student answer using retrieved context.

    Each answer is embedded and its nearest marking-scheme chunks are looked
    up in the module-level FAISS index populated by ``vectorize_pdf``.

    Args:
        student_pdf_file: The uploaded student PDF, passed to
            ``parse_student_pdf_qna``.
        top_k: Maximum number of marking-scheme chunks to retrieve per
            answer. Capped at the number of indexed chunks.
        max_marks: Maximum marks stated in every prompt.

    Returns:
        A dict mapping question text to its prompt string. If the same
        question text occurs more than once, later occurrences get a
        `` (2)``, `` (3)``, ... suffix so no answer is dropped. If no
        marking scheme has been vectorized yet, an error string is returned
        instead.
    """
    if vector_store is None or chunks_store is None:
        return "Error: No marking scheme vector store loaded. Please upload PDF first."

    qas = parse_student_pdf_qna(student_pdf_file)
    if not qas:
        return {}

    # FAISS pads the result with -1 when asked for more neighbours than it
    # holds; -1 would silently index the *last* chunk, so cap k up front.
    num_chunks = len(chunks_store)
    k = min(top_k, num_chunks)

    # Embed all answers in one batch instead of one model call per answer.
    answers = [answer_text for _, answer_text in qas]
    query_vecs = embed_model.encode(answers, convert_to_numpy=True)
    _, neighbour_ids = vector_store.search(query_vecs, k)

    prompts = {}
    for (question, answer_text), ids in zip(qas, neighbour_ids):
        # Keep only valid row ids (defensive against -1 padding).
        retrieved_chunks = [chunks_store[i] for i in ids if 0 <= i < num_chunks]

        prompt = (
            "Instruction: You are a national exam marker. Compare the "
            "student's answer with the marking scheme and award marks "
            "according to the scheme. Provide rationale. Award partial marks "
            "if some points are covered. Output in JSON.\n"
            "\n"
            f"Question: {question}\n"
            f"Answer: {answer_text}\n"
            f"Marking Scheme Context: {' '.join(retrieved_chunks)}\n"
            f"Maximum Marks: {max_marks}\n"
            "Guidelines: If answer contains part of correct points, award "
            "partial marks proportionally.\n"
            "Output Format:\n"
            "{\n"
            '"score": <numeric>,\n'
            '"rationale": "<explanation>"\n'
            "}\n"
        )

        # Disambiguate repeated question texts rather than overwriting.
        key = question
        occurrence = 2
        while key in prompts:
            key = f"{question} ({occurrence})"
            occurrence += 1
        prompts[key] = prompt

    return prompts
# Gradio UI: two independent steps that share the module-level vector store.
with gr.Blocks() as demo:
    gr.Markdown("## Vectorization + Retrieval + Enhanced Prompt Generation")

    # Step 1: upload the marking scheme and build the index.
    pdf_file = gr.File(label="Upload Marking Scheme PDF")
    vector_output = gr.JSON(label="Vectorization Info")
    submit_vector = gr.Button("Vectorize PDF")
    submit_vector.click(vectorize_pdf, inputs=[pdf_file], outputs=[vector_output])

    # Step 2: upload the student answers and generate one prompt per answer.
    # top_k and max_marks are not wired to inputs, so their defaults apply.
    student_pdf = gr.File(label="Upload Student Answer PDF")
    prompts_output = gr.JSON(label="Generated Prompts for Marking")
    submit_prompts = gr.Button("Generate Enhanced Prompts")
    submit_prompts.click(
        generate_enhanced_prompts,
        inputs=[student_pdf],
        outputs=[prompts_output],
    )
if __name__ == "__main__":
    # Bind to all network interfaces on port 7860 and surface errors in the UI.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)