ST-THOMAS-OF-AQUINAS committed on
Commit
6a34abf
·
verified ·
1 Parent(s): bb3f5c9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -0
app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import PyPDF2
3
+ import re
4
+ import pickle
5
+ from sentence_transformers import SentenceTransformer
6
+ import faiss
7
+ import numpy as np
8
+
9
+ # ----------------------------
10
+ # Embedding model
11
+ # ----------------------------
12
# Sentence-embedding model used by create_faiss_index(); it is constructed
# once, when this module is imported.
EMBED_MODEL_NAME = "all-mpnet-base-v2"
embed_model = SentenceTransformer(EMBED_MODEL_NAME)
13
+
14
+ # ----------------------------
15
+ # In-memory storage
16
+ # ----------------------------
17
# Module-level caches filled by vectorize_pdf(). All three are keyed by a
# document label; the only label written in this file is "marking_scheme".
vector_store: dict = {}      # label -> FAISS index
chunk_store: dict = {}       # label -> list of chunk strings
embedding_store: dict = {}   # label -> embeddings returned for those chunks
20
+
21
+ # ----------------------------
22
+ # PDF Loader and Chunker
23
+ # ----------------------------
24
def load_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Passed straight to ``PyPDF2.PdfReader``; the Gradio handler in
            this file supplies the uploaded file's path.

    Returns:
        A list with one entry per page, in page order, holding whatever
        ``page.extract_text()`` returned for that page.
    """
    reader = PyPDF2.PdfReader(file)
    page_texts = []
    for pdf_page in reader.pages:
        page_texts.append(pdf_page.extract_text())
    return page_texts
28
+
29
def chunk_text(text_pages, chunk_size=200, overlap=50):
    """Split page texts into overlapping, whitespace-normalised word windows.

    Each page is chunked independently, so no chunk spans a page boundary.
    Consecutive chunks of the same page share ``overlap`` words.

    Args:
        text_pages: Iterable of page strings. Falsy entries (``None`` or
            ``""``) and pages containing only whitespace produce no chunks.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, each made of single-space-joined words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a non-positive step would make the window loop never
            advance.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    for page in text_pages:
        if not page:
            continue
        # str.split() with no arguments drops leading/trailing whitespace,
        # so no empty "words" leak into the chunks.
        words = page.split()
        for start in range(0, len(words), step):
            end = start + chunk_size
            chunks.append(" ".join(words[start:end]))
            # Once a window reaches the end of the page, any further window
            # would only repeat words already covered by this one.
            if end >= len(words):
                break
    return chunks
41
+
42
+ # ----------------------------
43
+ # Vectorization
44
+ # ----------------------------
45
def create_faiss_index(chunks):
    """Embed text chunks and build an exact L2 FAISS index over them.

    Args:
        chunks: Non-empty list of chunk strings to embed.

    Returns:
        A tuple ``(index, chunks, embeddings)``: the populated
        ``faiss.IndexFlatL2``, the input chunks unchanged, and the 2-D
        float32 embedding matrix (one row per chunk).

    Raises:
        ValueError: If ``chunks`` is empty. There would be no embedding
            dimension to size the index with, and the original code failed
            later with an opaque indexing error.
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty chunk list")

    embeddings = embed_model.encode(chunks, convert_to_numpy=True)
    # FAISS expects a C-contiguous float32 matrix; this is a no-op when the
    # encoder already returns one.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks, embeddings
50
+
51
+ # ----------------------------
52
+ # Function to print structured table
53
+ # ----------------------------
54
def print_vector_table(chunks, embeddings, max_rows=10):
    """Print a console preview of chunk/embedding pairs.

    For at most ``max_rows`` pairs this prints the 1-based chunk number, the
    first 50 characters of the chunk (newlines replaced by spaces, with
    ``...`` appended when the chunk is longer than 50 characters) and the
    first five embedding values rounded to four decimals. The total number
    of chunks is printed last.
    """
    print("\n=== VECTOR TABLE ===\n")
    for row, (text, vector) in enumerate(zip(chunks, embeddings)):
        if row >= max_rows:
            break
        ellipsis = "..." if len(text) > 50 else ""
        snippet = text[:50].replace("\n", " ") + ellipsis
        leading_dims = np.round(vector[:5], 4)  # show first 5 dimensions
        print(f"Chunk {row+1}:")
        print(f" Text : {snippet}")
        print(f" Embedding (first 5 dims): {leading_dims}\n")
    print(f"Total chunks: {len(chunks)}\n")
69
+
70
+ # ----------------------------
71
+ # Main function
72
+ # ----------------------------
73
def vectorize_pdf(marking_scheme_file):
    """Load a marking-scheme PDF, chunk it, embed it and cache the results.

    On success the FAISS index, the chunks and the embeddings are stored in
    the module-level ``vector_store``, ``chunk_store`` and
    ``embedding_store`` dicts under the key ``"marking_scheme"``, and a
    preview table is printed to the console.

    Args:
        marking_scheme_file: The PDF to process, forwarded to ``load_pdf``.

    Returns:
        A human-readable status message. If the PDF yields no text chunks
        (for example an image-only scan), a message saying so is returned
        and the stores are left untouched.
    """
    # Load PDF text
    pages = load_pdf(marking_scheme_file)

    # Chunk PDF
    chunks = chunk_text(pages)

    # Without any chunks there is nothing to embed; index creation would
    # fail, so report the situation to the user instead.
    if not chunks:
        return "No extractable text found in the PDF; nothing was vectorized."

    # Create FAISS index + embeddings
    index, stored_chunks, embeddings = create_faiss_index(chunks)

    # Save in memory
    vector_store["marking_scheme"] = index
    chunk_store["marking_scheme"] = stored_chunks
    embedding_store["marking_scheme"] = embeddings

    # Print structured table
    print_vector_table(stored_chunks, embeddings)

    return f"Vectorization complete! Number of chunks: {len(chunks)} (see console for structured table preview)"
92
+
93
+ # ----------------------------
94
+ # Gradio UI
95
+ # ----------------------------
96
def _handle_submit(uploaded_file):
    """Gradio click handler: vectorize the uploaded PDF, if there is one.

    Args:
        uploaded_file: Value of the ``gr.File`` component; ``None`` when the
            button is clicked before a file has been uploaded.

    Returns:
        The status message from ``vectorize_pdf``, or a prompt to upload a
        file when nothing was provided.
    """
    if uploaded_file is None:
        return "Please upload a PDF before vectorizing."
    # NOTE(review): the File component is assumed to yield either an object
    # exposing the path as ``.name`` or a plain path string, depending on the
    # Gradio version -- confirm against the pinned version.
    pdf_path = getattr(uploaded_file, "name", uploaded_file)
    return vectorize_pdf(pdf_path)


with gr.Blocks() as demo:
    gr.Markdown("## Upload Marking Scheme PDF for Vectorization")
    marking_pdf = gr.File(label="Marking Scheme PDF")
    output = gr.Textbox()
    submit = gr.Button("Vectorize PDF")
    submit.click(_handle_submit, inputs=[marking_pdf], outputs=[output])

if __name__ == "__main__":
    # Listen on all interfaces on port 7860 and surface handler errors in the UI.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)