Spaces:

ST-THOMAS-OF-AQUINAS
/

Vector_emebeding

Sleeping

File size: 4,902 Bytes

6a34abf
 
 
d60bf93
6a34abf
 
 
 
 
 
 
 
 
 
 
d60bf93
 
 
 
6a34abf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d60bf93
 
 
 
 
6a34abf
d60bf93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a34abf
 
d60bf93
6a34abf
d60bf93
6a34abf
d60bf93
 
 
 
6a34abf
d60bf93
 
 
 
 
 
 
 
 
 
6a34abf
 
01d723b
6a34abf
01d723b
d60bf93
 
 
6a34abf
d60bf93
 
6a34abf
d60bf93
 
 
 
 
 
 
 
01d723b
 
 
 
 
 
 
 
 
 
 
 
 
 
d60bf93
 
 
6a34abf
 
 
 
 
01d723b
 
d60bf93
 
 
 
 
 
 
 
 
01d723b
 
6a34abf

import gradio as gr
import PyPDF2
import re
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# ----------------------------
# Embedding model
# ----------------------------
# Instantiated once at import time; vectorize_pdf() and
# generate_enhanced_prompts() both encode text through this same object.
embed_model = SentenceTransformer("all-mpnet-base-v2")

# ----------------------------
# In-memory storage
# ----------------------------
# All three slots start empty and are filled by vectorize_pdf():
#   vector_store     -> FAISS index built from the chunk embeddings
#   chunks_store     -> the text chunks, in the order they were indexed
#   embeddings_store -> the embedding matrix that was added to the index
vector_store = chunks_store = embeddings_store = None
TOP_K = 3  # number of chunks to retrieve

# ----------------------------
# PDF Loader and Chunker
# ----------------------------
def load_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Passed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        list[str]: One entry per page, in page order. A page for which no
        text could be extracted is represented by ``""`` so that callers can
        join or split every entry without checking for ``None``.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # Normalise a missing/empty extraction result to "" -- a None entry would
    # make a later "\n".join(pages) raise TypeError.
    return [page.extract_text() or "" for page in pdf_reader.pages]

def chunk_text(text_pages, chunk_size=200, overlap=50):
    """Split page texts into overlapping, whitespace-normalised word chunks.

    Each page is chunked independently (chunks never span two pages). Within
    a page, consecutive chunks share ``overlap`` words.

    Args:
        text_pages: Iterable of page strings. Falsy entries (``None``, ``""``)
            and whitespace-only pages are skipped.
        chunk_size: Maximum number of words per chunk. Must be >= 1.
        overlap: Number of words repeated between consecutive chunks. Must be
            smaller than ``chunk_size``.

    Returns:
        list[str]: Chunks in page order, words joined by single spaces.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``overlap`` is
            not smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    if overlap >= chunk_size:
        # A non-positive step would loop forever while appending chunks.
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    chunks = []
    for page in text_pages:
        if not page:
            continue
        # str.split() drops leading/trailing whitespace, so no empty tokens
        # (and no whitespace-only chunks) are produced.
        words = page.split()
        for start in range(0, len(words), step):
            end = start + chunk_size
            chunks.append(" ".join(words[start:end]))
            if end >= len(words):
                # This window already reached the end of the page; another
                # one would contain only words repeated from this chunk.
                break
    return chunks

# ----------------------------
# Vectorization
# ----------------------------
def vectorize_pdf(marking_scheme_file):
    """Chunk and embed a marking-scheme PDF and store it as the active index.

    On success the module-level ``vector_store``, ``chunks_store`` and
    ``embeddings_store`` are replaced together. On failure they are left
    untouched, so a previously loaded marking scheme stays usable.

    Args:
        marking_scheme_file: The uploaded PDF, forwarded to ``load_pdf``.
            ``None`` is treated as "nothing uploaded".

    Returns:
        dict: ``{"num_chunks": int, "preview": list[dict]}`` where each
        preview row holds ``chunk_id`` (1-based), ``text_preview`` (first 50
        characters) and ``embedding_preview`` (first 5 components, rounded to
        4 decimals) for up to the first 10 chunks. When nothing could be
        indexed, ``num_chunks`` is 0, ``preview`` is empty and an extra
        ``"error"`` key explains why.
    """
    global vector_store, chunks_store, embeddings_store

    if marking_scheme_file is None:
        return {
            "num_chunks": 0,
            "preview": [],
            "error": "Error: No marking scheme PDF uploaded.",
        }

    pages = load_pdf(marking_scheme_file)
    chunks = chunk_text(pages)
    if not chunks:
        # Without this guard, encoding an empty list yields an array with no
        # second dimension and embeddings.shape[1] raises IndexError.
        return {
            "num_chunks": 0,
            "preview": [],
            "error": "Error: No extractable text found in the marking scheme PDF.",
        }

    embeddings = embed_model.encode(chunks, convert_to_numpy=True)
    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    # Build the index in a local first so the globals are only replaced once
    # everything has succeeded.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    vector_store = index
    chunks_store = chunks
    embeddings_store = embeddings

    # Preview table
    table_preview = [
        {
            "chunk_id": i + 1,
            "text_preview": chunk[:50].replace("\n", " ") + ("..." if len(chunk) > 50 else ""),
            "embedding_preview": np.round(embeddings[i][:5], 4).tolist(),
        }
        for i, chunk in enumerate(chunks[:10])
    ]

    return {
        "num_chunks": len(chunks),
        "preview": table_preview,
    }

# ----------------------------
# Parse student PDF (Question + Answer)
# ----------------------------
def parse_student_pdf_qna(student_pdf_file):
    """Extract (question, answer) pairs from a student answer PDF.

    The PDF text is expected to contain entries of the form::

        Question: <text>
        Answer: <text>

    Matching is case-insensitive and spans line and page breaks. An answer
    runs until the next ``Question:`` marker or the end of the document.

    Args:
        student_pdf_file: The PDF to parse, forwarded to ``load_pdf``.

    Returns:
        list[tuple[str, str]]: ``(question, answer)`` pairs in document order,
        each stripped of surrounding whitespace. A question whose answer was
        left blank yields an empty answer string.
    """
    pages = load_pdf(student_pdf_file)
    # Tolerate pages without extractable text (None/"") when joining.
    text = "\n".join(page or "" for page in pages)

    # The answer group uses `.*?` rather than `.+?`: requiring at least one
    # character would make a blank answer swallow the following
    # "Question: ... Answer: ..." entry as its own text.
    pattern = re.compile(
        r"Question:\s*(.+?)\s*Answer:\s*(.*?)(?=Question:|$)",
        re.DOTALL | re.IGNORECASE,
    )

    return [(q.strip(), a.strip()) for q, a in pattern.findall(text)]

# ----------------------------
# Retrieve relevant chunks and generate enhanced prompt
# ----------------------------
def generate_enhanced_prompts(student_pdf_file, top_k=TOP_K, max_marks=4):
    """Build one marking prompt per student answer, enriched with retrieved context.

    Each answer is embedded and used to query the marking-scheme index; the
    nearest chunks are inserted into the prompt as "Marking Scheme Context".

    Args:
        student_pdf_file: Student answer PDF, forwarded to
            ``parse_student_pdf_qna``. ``None`` is treated as "nothing
            uploaded".
        top_k: Number of marking-scheme chunks to retrieve per answer. Capped
            at the number of stored chunks.
        max_marks: Value written into the prompt's "Maximum Marks" line.

    Returns:
        dict[str, str] | str: Mapping of question text to prompt. If the same
        question text occurs more than once, later occurrences are keyed as
        ``"<question> (2)"``, ``"<question> (3)"``, ... so no answer is
        dropped. A string starting with ``"Error:"`` is returned when no
        marking scheme is loaded, no file was given, or no question/answer
        pairs were found.
    """
    if vector_store is None or chunks_store is None:
        return "Error: No marking scheme vector store loaded. Please upload PDF first."
    if student_pdf_file is None:
        return "Error: No student answer PDF uploaded."

    qas = parse_student_pdf_qna(student_pdf_file)
    if not qas:
        return "Error: No 'Question: ... Answer: ...' pairs found in the student PDF."

    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, which as a list index would silently pick the last chunk.
    num_chunks = len(chunks_store)
    k = min(top_k, num_chunks)

    # Embed all answers in one batch and search them in one call.
    answers = [answer_text for _, answer_text in qas]
    query_vecs = embed_model.encode(answers, convert_to_numpy=True)
    query_vecs = np.ascontiguousarray(query_vecs, dtype=np.float32)
    _, all_indices = vector_store.search(query_vecs, k)

    prompts = {}
    for (question, answer_text), row in zip(qas, all_indices):
        # Defensive filter in case the index still reports padding (-1).
        retrieved_chunks = [chunks_store[i] for i in row if 0 <= i < num_chunks]

        # Create enhanced prompt
        prompt = f"""Instruction: You are a national exam marker. Compare the student's answer with the marking scheme and award marks according to the scheme. Provide rationale. Award partial marks if some points are covered. Output in JSON.

Question: {question}
Answer: {answer_text}
Marking Scheme Context: {' '.join(retrieved_chunks)}
Maximum Marks: {max_marks}
Guidelines: If answer contains part of correct points, award partial marks proportionally.
Output Format:
{{
  "score": <numeric>,
  "rationale": "<explanation>"
}}
"""
        # Keep every answer even when two questions share the same text.
        key = question
        occurrence = 2
        while key in prompts:
            key = f"{question} ({occurrence})"
            occurrence += 1
        prompts[key] = prompt

    return prompts

# ----------------------------
# Gradio UI
# ----------------------------
# Components are created inside the Blocks context in the order written here:
# first the marking-scheme section, then the student-answer section.
with gr.Blocks() as demo:
    gr.Markdown("## Vectorization + Retrieval + Enhanced Prompt Generation")

    # Step 1: upload the marking scheme PDF and index it.
    # The button passes the uploaded file to vectorize_pdf() and shows the
    # returned summary (chunk count + preview rows) in the JSON panel.
    pdf_file = gr.File(label="Upload Marking Scheme PDF")
    vector_output = gr.JSON(label="Vectorization Info")
    submit_vector = gr.Button("Vectorize PDF")
    submit_vector.click(vectorize_pdf, inputs=[pdf_file], outputs=[vector_output])

    # Step 2: upload the student answer PDF and generate marking prompts.
    # Only the file is wired as an input, so generate_enhanced_prompts() runs
    # with its default top_k and max_marks. It relies on the index built in
    # step 1 and returns an error message if that has not been done yet.
    student_pdf = gr.File(label="Upload Student Answer PDF")
    prompts_output = gr.JSON(label="Generated Prompts for Marking")
    submit_prompts = gr.Button("Generate Enhanced Prompts")
    submit_prompts.click(generate_enhanced_prompts, inputs=[student_pdf], outputs=[prompts_output])

# Start the web server only when run as a script. 0.0.0.0 listens on all
# network interfaces (not just localhost) on port 7860.
# NOTE(review): show_error=True presumably surfaces handler exceptions in the
# browser UI -- confirm against the Gradio launch() documentation.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)