Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import PyPDF2
|
| 3 |
+
import re
|
| 4 |
+
import pickle
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
import faiss
|
| 7 |
+
import numpy as np
|
| 8 |
+
|
| 9 |
+
# --- Embedding model ---------------------------------------------------
# Instantiated once at import time; create_faiss_index() reuses this
# module-level instance for every encode call.
embed_model = SentenceTransformer("all-mpnet-base-v2")

# --- In-memory storage -------------------------------------------------
# Three separate dicts that vectorize_pdf() fills under the key
# "marking_scheme": the FAISS index, the chunk texts, and the embeddings.
vector_store, chunk_store, embedding_store = {}, {}, {}
|
| 20 |
+
|
| 21 |
+
# ----------------------------
|
| 22 |
+
# PDF Loader and Chunker
|
| 23 |
+
# ----------------------------
|
| 24 |
+
def load_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Passed straight to ``PyPDF2.PdfReader`` as its first argument
            (the Gradio click handler in this file supplies ``f.name``).

    Returns:
        A list with one entry per page, in page order, holding whatever
        ``page.extract_text()`` returned for that page.
    """
    reader = PyPDF2.PdfReader(file)
    extracted = []
    for pdf_page in reader.pages:
        extracted.append(pdf_page.extract_text())
    return extracted
|
| 28 |
+
|
| 29 |
+
def chunk_text(text_pages, chunk_size=200, overlap=50):
    """Split page texts into overlapping, word-based chunks.

    Every page is chunked on its own, so no chunk spans a page boundary.
    Consecutive chunks of the same page share ``overlap`` words.

    Args:
        text_pages: Iterable of page strings. Falsy entries (``None`` or
            ``""``) and whitespace-only pages are skipped.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by neighbouring chunks. Must be
            smaller than ``chunk_size`` so the window always advances.

    Returns:
        A list of chunk strings, each made of single-space-joined words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    chunks = []
    for page in text_pages:
        if not page:
            continue
        # str.split() with no argument drops leading/trailing whitespace, so
        # no empty "words" leak into the chunks.
        words = page.split()
        for start in range(0, len(words), step):
            end = start + chunk_size
            chunks.append(" ".join(words[start:end]))
            # Once a window reaches the end of the page, any further window
            # would only repeat words already contained in this chunk.
            if end >= len(words):
                break
    return chunks
|
| 41 |
+
|
| 42 |
+
# ----------------------------
|
| 43 |
+
# Vectorization
|
| 44 |
+
# ----------------------------
|
| 45 |
+
def create_faiss_index(chunks):
    """Embed the chunks and load the embeddings into a FAISS L2 index.

    Args:
        chunks: Non-empty sequence of chunk strings to embed with the
            module-level ``embed_model``.

    Returns:
        A tuple ``(index, chunks, embeddings)``: the populated
        ``faiss.IndexFlatL2``, the same ``chunks`` object that was passed
        in, and the array returned by ``embed_model.encode``.

    Raises:
        ValueError: If ``chunks`` is empty. Without this guard the
            dimensionality lookup ``embeddings.shape[1]`` below has nothing
            to read and fails with an unhelpful indexing error.
    """
    if len(chunks) == 0:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks")

    embeddings = embed_model.encode(chunks, convert_to_numpy=True)
    # The index dimensionality is taken from the embedding width.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks, embeddings
|
| 50 |
+
|
| 51 |
+
# ----------------------------
|
| 52 |
+
# Function to print structured table
|
| 53 |
+
# ----------------------------
|
| 54 |
+
def print_vector_table(chunks, embeddings, max_rows=10):
    """Print a console preview that pairs chunks with their embeddings.

    For at most ``max_rows`` chunk/embedding pairs this prints the 1-based
    chunk number, the first 50 characters of the chunk (newlines replaced
    by spaces, ``...`` appended when the chunk is longer than 50
    characters) and the first five embedding values rounded to four
    decimals. A final line reports ``len(chunks)``.

    Args:
        chunks: Sequence of chunk strings.
        embeddings: Iterable of per-chunk vectors, aligned with ``chunks``.
        max_rows: Upper bound on the number of rows printed.
    """
    print("\n=== VECTOR TABLE ===\n")
    for position, (text, vector) in enumerate(zip(chunks, embeddings)):
        if not position < max_rows:
            break
        snippet = text[:50].replace("\n", " ")
        if len(text) > 50:
            snippet += "..."
        leading_dims = np.round(vector[:5], 4)  # only the first 5 dimensions
        print(f"Chunk {position + 1}:")
        print(f" Text : {snippet}")
        print(f" Embedding (first 5 dims): {leading_dims}\n")
    print(f"Total chunks: {len(chunks)}\n")
|
| 69 |
+
|
| 70 |
+
# ----------------------------
|
| 71 |
+
# Main function
|
| 72 |
+
# ----------------------------
|
| 73 |
+
def vectorize_pdf(marking_scheme_file):
    """Load a marking-scheme PDF, chunk it, embed it and cache the results.

    On success the FAISS index, the chunk texts and the embeddings are
    stored in ``vector_store``, ``chunk_store`` and ``embedding_store``
    under the key ``"marking_scheme"``, and a preview table is printed to
    the console.

    Args:
        marking_scheme_file: Passed to ``load_pdf`` (the Gradio handler in
            this file supplies the uploaded file's path).

    Returns:
        A status message for the UI: either the success message with the
        chunk count, or a notice that the PDF contained no extractable
        text, in which case the stores are left unchanged.
    """
    # Load PDF text
    pages = load_pdf(marking_scheme_file)

    # Chunk PDF
    chunks = chunk_text(pages)

    # With zero chunks there is nothing to embed, and create_faiss_index
    # cannot derive an embedding dimension; report it instead of crashing.
    if not chunks:
        return "No extractable text found in the PDF; nothing was vectorized."

    # Create FAISS index + embeddings
    index, stored_chunks, embeddings = create_faiss_index(chunks)

    # Save in memory
    vector_store["marking_scheme"] = index
    chunk_store["marking_scheme"] = stored_chunks
    embedding_store["marking_scheme"] = embeddings

    # Print structured table
    print_vector_table(stored_chunks, embeddings)

    return f"Vectorization complete! Number of chunks: {len(chunks)} (see console for structured table preview)"
|
| 92 |
+
|
| 93 |
+
# ----------------------------
|
| 94 |
+
# Gradio UI
|
| 95 |
+
# ----------------------------
|
| 96 |
+
with gr.Blocks() as demo:
    gr.Markdown("## Upload Marking Scheme PDF for Vectorization")
    marking_pdf = gr.File(label="Marking Scheme PDF")
    output = gr.Textbox()
    submit = gr.Button("Vectorize PDF")
    # The handler receives None when the button is clicked before a file
    # has been uploaded; guard so the user gets a message rather than an
    # AttributeError from `None.name`.
    submit.click(
        lambda f: (
            vectorize_pdf(f.name)
            if f is not None
            else "Please upload a PDF file first."
        ),
        inputs=[marking_pdf],
        outputs=[output],
    )

if __name__ == "__main__":
    # Listen on all interfaces on port 7860 and surface handler errors in the UI.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
|