Spaces:

ST-THOMAS-OF-AQUINAS
/

Vector_emebeding

Sleeping

App Files Files Community

ST-THOMAS-OF-AQUINAS committed on Jan 7

Commit

d60bf93

verified ·

1 Parent(s): f462a1b

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -46

app.py CHANGED Viewed

@@ -1,10 +1,9 @@
 import gradio as gr
 import PyPDF2
 import re
-import pickle
 from sentence_transformers import SentenceTransformer
 import faiss
-import numpy as np
 # ----------------------------
 # Embedding model
@@ -14,9 +13,10 @@ embed_model = SentenceTransformer("all-mpnet-base-v2")
 # ----------------------------
 # In-memory storage
 # ----------------------------
-vector_store = {}
-chunk_store = {}
-embedding_store = {}
 # ----------------------------
 # PDF Loader and Chunker
@@ -42,63 +42,100 @@ def chunk_text(text_pages, chunk_size=200, overlap=50):
 # ----------------------------
 # Vectorization
 # ----------------------------
-def create_faiss_index(chunks):
     embeddings = embed_model.encode(chunks, convert_to_numpy=True)
-    index = faiss.IndexFlatL2(embeddings.shape[1])
-    index.add(embeddings)
-    return index, chunks, embeddings
 # ----------------------------
-# Function to print structured table
 # ----------------------------
-def print_vector_table(chunks, embeddings, max_rows=10):
     """
-    Prints a structured table:
-    | Chunk # | Chunk Text (first 50 chars) | Embedding Preview (first 5 dims) |
     """
-    print("\n=== VECTOR TABLE ===\n")
-    for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
-        if i >= max_rows:
-            break
-        preview_text = chunk[:50].replace("\n", " ") + ("..." if len(chunk) > 50 else "")
-        preview_emb = np.round(emb[:5], 4)  # show first 5 dimensions
-        print(f"Chunk {i+1}:")
-        print(f"  Text : {preview_text}")
-        print(f"  Embedding (first 5 dims): {preview_emb}\n")
-    print(f"Total chunks: {len(chunks)}\n")
 # ----------------------------
-# Main function
 # ----------------------------
-def vectorize_pdf(marking_scheme_file):
-    # Load PDF text
-    pages = load_pdf(marking_scheme_file)
-    # Chunk PDF
-    chunks = chunk_text(pages)
-    # Create FAISS index + embeddings
-    index, stored_chunks, embeddings = create_faiss_index(chunks)
-    # Save in memory
-    vector_store["marking_scheme"] = index
-    chunk_store["marking_scheme"] = stored_chunks
-    embedding_store["marking_scheme"] = embeddings
-    # Print structured table
-    print_vector_table(stored_chunks, embeddings)
-    return f"Vectorization complete! Number of chunks: {len(chunks)} (see console for structured table preview)"
 # ----------------------------
 # Gradio UI
 # ----------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("## Upload Marking Scheme PDF for Vectorization")
-    marking_pdf = gr.File(label="Marking Scheme PDF")
-    output = gr.Textbox()
-    submit = gr.Button("Vectorize PDF")
-    submit.click(lambda f: vectorize_pdf(f.name), inputs=[marking_pdf], outputs=[output])
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)

 import gradio as gr
 import PyPDF2
 import re
+import numpy as np
 from sentence_transformers import SentenceTransformer
 import faiss
 # ----------------------------
 # Embedding model
 # ----------------------------
 # In-memory storage
 # ----------------------------
+vector_store = None
+chunks_store = None
+embeddings_store = None
+TOP_K = 3  # number of chunks to retrieve
 # ----------------------------
 # PDF Loader and Chunker
 # ----------------------------
 # Vectorization
 # ----------------------------
def vectorize_pdf(marking_scheme_file):
    """Embed a marking-scheme PDF and load it into the in-memory FAISS store.

    The PDF is read with ``load_pdf``, split with ``chunk_text``, embedded with
    ``embed_model`` and indexed in a fresh ``faiss.IndexFlatL2``. On success the
    module-level ``vector_store``, ``chunks_store`` and ``embeddings_store`` are
    replaced; when nothing can be vectorized they are left untouched.

    Args:
        marking_scheme_file: Value delivered by the ``gr.File`` input: a path
            string, an upload object exposing its path as ``.name``, or
            ``None`` when nothing was uploaded.

    Returns:
        dict: ``num_chunks`` (total number of chunks) and ``preview`` (up to
        ten rows with ``chunk_id``, ``text_preview`` and
        ``embedding_preview``). When nothing could be vectorized,
        ``num_chunks`` is 0, ``preview`` is empty and an ``error`` message is
        included.
    """
    global vector_store, chunks_store, embeddings_store

    if marking_scheme_file is None:
        return {"num_chunks": 0, "preview": [], "error": "No PDF uploaded."}

    # The UI passes the raw gr.File value; unwrap upload objects to their
    # path so load_pdf receives a path either way.
    pdf_path = getattr(marking_scheme_file, "name", marking_scheme_file)

    pages = load_pdf(pdf_path)
    chunks = chunk_text(pages)
    if len(chunks) == 0:
        # Without chunks there is no embedding dimension to size the index
        # with, so embeddings.shape[1] below would fail.
        return {
            "num_chunks": 0,
            "preview": [],
            "error": "No text could be extracted from the PDF.",
        }

    embeddings = embed_model.encode(chunks, convert_to_numpy=True)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Publish only once the index is fully built, so a failure above cannot
    # leave the three stores out of sync with each other.
    vector_store = index
    chunks_store = chunks
    embeddings_store = embeddings

    preview = [
        {
            "chunk_id": row + 1,
            "text_preview": chunk[:50].replace("\n", " ") + ("..." if len(chunk) > 50 else ""),
            "embedding_preview": np.round(embeddings[row][:5], 4).tolist(),
        }
        for row, chunk in enumerate(chunks[:10])
    ]
    return {"num_chunks": len(chunks), "preview": preview}
 # ----------------------------
+# Parse student PDF (Question + Answer)
 # ----------------------------
def parse_student_pdf_qna(student_pdf_file):
    """Extract (question, answer) pairs from a student answer PDF.

    The PDF text is expected to contain entries of the form::

        Question: <text>
        Answer: <text>

    Matching is case-insensitive and spans line breaks. An entry whose answer
    is blank is returned with an empty answer string.

    Args:
        student_pdf_file: A path string, an upload object exposing its path
            as ``.name``, or ``None`` when nothing was uploaded.

    Returns:
        list[tuple[str, str]]: Whitespace-stripped ``(question, answer)``
        tuples in document order; empty when no file was given or nothing
        matched.
    """
    if student_pdf_file is None:
        return []

    # Unwrap upload objects to their path so load_pdf receives a path.
    pdf_path = getattr(student_pdf_file, "name", student_pdf_file)

    pages = load_pdf(pdf_path)
    text = "\n".join(pages)

    # The answer group is `.*?` rather than `.+?`: requiring at least one
    # character forces a blank answer to consume the start of the next
    # "Question:", merging two entries into one (or dropping a trailing one).
    pattern = re.compile(
        r"Question:\s*(.+?)\s*Answer:\s*(.*?)(?=Question:|$)",
        re.DOTALL | re.IGNORECASE,
    )
    return [(question.strip(), answer.strip()) for question, answer in pattern.findall(text)]
 # ----------------------------
+# Retrieve relevant chunks and format prompt
 # ----------------------------
def create_prompts(student_pdf_file, top_k=TOP_K):
    """Build one marking prompt per student answer, with retrieved context.

    Each answer parsed by ``parse_student_pdf_qna`` is embedded with
    ``embed_model`` and used to search ``vector_store``; the matching entries
    of ``chunks_store`` are joined with spaces and appended to the prompt.

    Args:
        student_pdf_file: Student answer PDF as delivered by the ``gr.File``
            input, or ``None`` when nothing was uploaded.
        top_k: Maximum number of marking-scheme chunks to retrieve per answer.

    Returns:
        dict[str, str] mapping each question to its prompt, or an error
        string when no marking scheme has been vectorized or no student PDF
        was supplied. A question that occurs more than once keeps only its
        last prompt.
    """
    if vector_store is None or chunks_store is None:
        return "Error: No marking scheme vector store loaded. Please upload PDF first."
    if student_pdf_file is None:
        return "Error: No student answer PDF uploaded."

    # Never request more neighbours than there are chunks: FAISS fills the
    # missing slots with index -1, which would silently select the last chunk.
    k = min(top_k, len(chunks_store))

    prompts = {}
    for question, answer_text in parse_student_pdf_qna(student_pdf_file):
        retrieved_chunks = []
        if k > 0:
            query_vec = embed_model.encode([answer_text], convert_to_numpy=True)
            _distances, indices = vector_store.search(query_vec, k)
            # Filter -1 placeholders as well, in case the index holds fewer
            # vectors than chunks_store has entries.
            retrieved_chunks = [chunks_store[i] for i in indices[0] if i >= 0]
        context = " ".join(retrieved_chunks)
        prompts[question] = (
            f"Question: {question}\nAnswer: {answer_text}\nMarking Scheme Context: {context}"
        )
    return prompts
 # ----------------------------
 # Gradio UI
 # ----------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("## Vectorization + Retrieval + Prompt Generation")
+    # Upload marking scheme PDF
+    pdf_file = gr.File(label="Upload Marking Scheme PDF")
+    vector_output = gr.JSON(label="Vectorization Info")
+    submit_vector = gr.Button("Vectorize PDF")
+    submit_vector.click(vectorize_pdf, inputs=[pdf_file], outputs=[vector_output])
+    # Upload student answer PDF
+    student_pdf = gr.File(label="Upload Student Answer PDF")
+    prompts_output = gr.JSON(label="Generated Prompts for Marking")
+    submit_prompts = gr.Button("Generate Prompts")
+    submit_prompts.click(create_prompts, inputs=[student_pdf], outputs=[prompts_output])
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)