Spaces:

ST-THOMAS-OF-AQUINAS
/

Vector_emebeding

Sleeping

App Files Files Community

ST-THOMAS-OF-AQUINAS committed on Jan 7

Commit

6a34abf

verified ·

1 Parent(s): bb3f5c9

Create app.py

Browse files

Files changed (1) hide show

app.py +104 -0

app.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import gradio as gr
+import PyPDF2
+import re
+import pickle
+from sentence_transformers import SentenceTransformer
+import faiss
+import numpy as np
+# ----------------------------
+# Embedding model
+# ----------------------------
+embed_model = SentenceTransformer("all-mpnet-base-v2")  # created once at import; used by create_faiss_index()
+# ----------------------------
+# In-memory storage
+# ----------------------------
+# Module-level dicts written by vectorize_pdf(), which stores its results
+# under the key "marking_scheme". Nothing here is persisted to disk.
+vector_store = {}  # key -> FAISS index
+chunk_store = {}  # key -> list of text chunks that were indexed
+embedding_store = {}  # key -> embeddings array returned by the model
+# ----------------------------
+# PDF Loader and Chunker
+# ----------------------------
+def load_pdf(file):
+    pdf_reader = PyPDF2.PdfReader(file)
+    text_pages = [page.extract_text() for page in pdf_reader.pages]
+    return text_pages
+def chunk_text(text_pages, chunk_size=200, overlap=50):
+    chunks = []
+    for page in text_pages:
+        if not page:
+            continue
+        words = re.split(r'\s+', page)
+        start = 0
+        while start < len(words):
+            end = start + chunk_size
+            chunks.append(" ".join(words[start:end]))
+            start += chunk_size - overlap
+    return chunks
+# ----------------------------
+# Vectorization
+# ----------------------------
+def create_faiss_index(chunks):
+    embeddings = embed_model.encode(chunks, convert_to_numpy=True)
+    index = faiss.IndexFlatL2(embeddings.shape[1])
+    index.add(embeddings)
+    return index, chunks, embeddings
+# ----------------------------
+# Function to print structured table
+# ----------------------------
+def print_vector_table(chunks, embeddings, max_rows=10):
+    """
+    Prints a structured table:
+    | Chunk # | Chunk Text (first 50 chars) | Embedding Preview (first 5 dims) |
+    """
+    print("\n=== VECTOR TABLE ===\n")
+    for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
+        if i >= max_rows:
+            break
+        preview_text = chunk[:50].replace("\n", " ") + ("..." if len(chunk) > 50 else "")
+        preview_emb = np.round(emb[:5], 4)  # show first 5 dimensions
+        print(f"Chunk {i+1}:")
+        print(f"  Text : {preview_text}")
+        print(f"  Embedding (first 5 dims): {preview_emb}\n")
+    print(f"Total chunks: {len(chunks)}\n")
+# ----------------------------
+# Main function
+# ----------------------------
+def vectorize_pdf(marking_scheme_file):
+    # Load PDF text
+    pages = load_pdf(marking_scheme_file)
+    # Chunk PDF
+    chunks = chunk_text(pages)
+    # Create FAISS index + embeddings
+    index, stored_chunks, embeddings = create_faiss_index(chunks)
+    # Save in memory
+    vector_store["marking_scheme"] = index
+    chunk_store["marking_scheme"] = stored_chunks
+    embedding_store["marking_scheme"] = embeddings
+    # Print structured table
+    print_vector_table(stored_chunks, embeddings)
+    return f"Vectorization complete! Number of chunks: {len(chunks)} (see console for structured table preview)"
+# ----------------------------
+# Gradio UI
+# ----------------------------
+with gr.Blocks() as demo:
+    gr.Markdown("## Upload Marking Scheme PDF for Vectorization")
+    marking_pdf = gr.File(label="Marking Scheme PDF")
+    output = gr.Textbox()
+    submit = gr.Button("Vectorize PDF")
+    submit.click(lambda f: vectorize_pdf(f.name), inputs=[marking_pdf], outputs=[output])
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)