ST-THOMAS-OF-AQUINAS committed on
Commit
d60bf93
·
verified ·
1 Parent(s): f462a1b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -46
app.py CHANGED
@@ -1,10 +1,9 @@
1
  import gradio as gr
2
  import PyPDF2
3
  import re
4
- import pickle
5
  from sentence_transformers import SentenceTransformer
6
  import faiss
7
- import numpy as np
8
 
9
  # ----------------------------
10
  # Embedding model
@@ -14,9 +13,10 @@ embed_model = SentenceTransformer("all-mpnet-base-v2")
14
  # ----------------------------
15
  # In-memory storage
16
  # ----------------------------
17
- vector_store = {}
18
- chunk_store = {}
19
- embedding_store = {}
 
20
 
21
  # ----------------------------
22
  # PDF Loader and Chunker
@@ -42,63 +42,100 @@ def chunk_text(text_pages, chunk_size=200, overlap=50):
42
  # ----------------------------
43
  # Vectorization
44
  # ----------------------------
45
- def create_faiss_index(chunks):
 
 
 
 
 
 
 
46
  embeddings = embed_model.encode(chunks, convert_to_numpy=True)
47
- index = faiss.IndexFlatL2(embeddings.shape[1])
48
- index.add(embeddings)
49
- return index, chunks, embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  # ----------------------------
52
- # Function to print structured table
53
  # ----------------------------
54
- def print_vector_table(chunks, embeddings, max_rows=10):
55
  """
56
- Prints a structured table:
57
- | Chunk # | Chunk Text (first 50 chars) | Embedding Preview (first 5 dims) |
 
 
58
  """
59
- print("\n=== VECTOR TABLE ===\n")
60
- for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
61
- if i >= max_rows:
62
- break
63
- preview_text = chunk[:50].replace("\n", " ") + ("..." if len(chunk) > 50 else "")
64
- preview_emb = np.round(emb[:5], 4) # show first 5 dimensions
65
- print(f"Chunk {i+1}:")
66
- print(f" Text : {preview_text}")
67
- print(f" Embedding (first 5 dims): {preview_emb}\n")
68
- print(f"Total chunks: {len(chunks)}\n")
69
 
70
  # ----------------------------
71
- # Main function
72
  # ----------------------------
73
- def vectorize_pdf(marking_scheme_file):
74
- # Load PDF text
75
- pages = load_pdf(marking_scheme_file)
76
-
77
- # Chunk PDF
78
- chunks = chunk_text(pages)
79
-
80
- # Create FAISS index + embeddings
81
- index, stored_chunks, embeddings = create_faiss_index(chunks)
82
-
83
- # Save in memory
84
- vector_store["marking_scheme"] = index
85
- chunk_store["marking_scheme"] = stored_chunks
86
- embedding_store["marking_scheme"] = embeddings
87
 
88
- # Print structured table
89
- print_vector_table(stored_chunks, embeddings)
90
 
91
- return f"Vectorization complete! Number of chunks: {len(chunks)} (see console for structured table preview)"
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  # ----------------------------
94
  # Gradio UI
95
  # ----------------------------
96
  with gr.Blocks() as demo:
97
- gr.Markdown("## Upload Marking Scheme PDF for Vectorization")
98
- marking_pdf = gr.File(label="Marking Scheme PDF")
99
- output = gr.Textbox()
100
- submit = gr.Button("Vectorize PDF")
101
- submit.click(lambda f: vectorize_pdf(f.name), inputs=[marking_pdf], outputs=[output])
 
 
 
 
 
 
 
 
102
 
103
  if __name__ == "__main__":
104
  demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
 
1
  import gradio as gr
2
  import PyPDF2
3
  import re
4
+ import numpy as np
5
  from sentence_transformers import SentenceTransformer
6
  import faiss
 
7
 
8
  # ----------------------------
9
  # Embedding model
 
13
  # ----------------------------
14
  # In-memory storage
15
  # ----------------------------
16
+ vector_store = None
17
+ chunks_store = None
18
+ embeddings_store = None
19
+ TOP_K = 3 # number of chunks to retrieve
20
 
21
  # ----------------------------
22
  # PDF Loader and Chunker
 
42
  # ----------------------------
43
  # Vectorization
44
  # ----------------------------
45
def vectorize_pdf(marking_scheme_file):
    """Load a marking-scheme PDF, chunk it, embed the chunks, and build an
    in-memory FAISS index for later retrieval.

    Side effects: rebinds the module-level globals ``vector_store`` (the
    FAISS index), ``chunks_store`` (the chunk texts) and ``embeddings_store``
    (the chunk embeddings) so that ``create_prompts`` can search against them.

    Args:
        marking_scheme_file: the uploaded PDF as delivered by the Gradio
            ``gr.File`` component (``load_pdf`` is expected to accept it —
            TODO confirm; the previous revision passed ``f.name``).

    Returns:
        dict: on success, ``{"num_chunks": int, "preview": [...]}`` where the
        preview lists up to 10 chunks with a 50-char text snippet and the
        first 5 embedding dimensions; on an empty extraction, an error dict.
    """
    global vector_store, chunks_store, embeddings_store

    # Load and chunk the PDF text.
    pages = load_pdf(marking_scheme_file)
    chunks = chunk_text(pages)

    # Guard: an empty or image-only PDF yields no chunks, and
    # embed_model.encode([]) has no usable shape[1] to size the index —
    # the original code would raise here instead of reporting the problem.
    if not chunks:
        return {"error": "No text could be extracted from the PDF.", "num_chunks": 0}

    # Generate embeddings (one vector per chunk).
    embeddings = embed_model.encode(chunks, convert_to_numpy=True)

    # Build an exact L2 FAISS index — fine for the small corpora used here.
    vector_store = faiss.IndexFlatL2(embeddings.shape[1])
    vector_store.add(embeddings)

    chunks_store = chunks
    embeddings_store = embeddings

    # Human-readable preview for the UI (first 10 chunks only).
    table_preview = []
    for i, chunk in enumerate(chunks[:10]):
        table_preview.append({
            "chunk_id": i + 1,
            "text_preview": chunk[:50].replace("\n", " ") + ("..." if len(chunk) > 50 else ""),
            "embedding_preview": np.round(embeddings[i][:5], 4).tolist(),
        })

    return {
        "num_chunks": len(chunks),
        "preview": table_preview,
    }
75
 
76
  # ----------------------------
77
+ # Parse student PDF (Question + Answer)
78
  # ----------------------------
79
def parse_student_pdf_qna(student_pdf_file):
    """Extract (question, answer) pairs from a student answer PDF.

    The PDF is expected to contain repeated blocks of the form::

        Question: <text>
        Answer: <text>

    Matching is case-insensitive and spans line breaks; each answer runs
    until the next ``Question:`` marker or the end of the document.

    Returns:
        list[tuple[str, str]]: whitespace-stripped (question, answer)
        pairs in document order.
    """
    # Flatten the per-page text into a single searchable string.
    full_text = "\n".join(load_pdf(student_pdf_file))

    qa_regex = re.compile(
        r"Question:\s*(.+?)\s*Answer:\s*(.+?)(?=Question:|$)",
        re.DOTALL | re.IGNORECASE,
    )

    return [
        (question.strip(), answer.strip())
        for question, answer in qa_regex.findall(full_text)
    ]
96
 
97
  # ----------------------------
98
+ # Retrieve relevant chunks and format prompt
99
  # ----------------------------
100
def create_prompts(student_pdf_file, top_k=TOP_K):
    """Build one marking prompt per student answer via FAISS retrieval.

    For each (question, answer) pair parsed from the student PDF, embeds
    the answer, retrieves the ``top_k`` nearest marking-scheme chunks from
    the module-level ``vector_store``, and formats a prompt string.

    Args:
        student_pdf_file: the uploaded student-answer PDF (Gradio File value).
        top_k: number of marking-scheme chunks to retrieve per answer.

    Returns:
        dict[str, str] mapping each question to its prompt, or an error
        string when no marking scheme has been vectorized yet.
    """
    # Reads (never rebinds) the module globals, so no `global` needed.
    if vector_store is None or chunks_store is None:
        return "Error: No marking scheme vector store loaded. Please upload PDF first."

    qas = parse_student_pdf_qna(student_pdf_file)
    prompts = {}

    # Never ask FAISS for more neighbours than the index holds: IndexFlatL2
    # pads missing results with index -1, and chunks_store[-1] would then
    # silently wrap around to the LAST chunk (wrong/duplicated context).
    k = min(top_k, vector_store.ntotal)

    for question, answer_text in qas:
        # Embed the student answer as the retrieval query.
        query_vec = embed_model.encode([answer_text], convert_to_numpy=True)

        # Search FAISS; drop any -1 padding defensively.
        distances, indices = vector_store.search(query_vec, k)
        retrieved_chunks = [chunks_store[i] for i in indices[0] if i >= 0]

        # Assemble the marking prompt for this question.
        prompt = f"Question: {question}\nAnswer: {answer_text}\nMarking Scheme Context: {' '.join(retrieved_chunks)}"
        prompts[question] = prompt

    return prompts
121
 
122
  # ----------------------------
123
  # Gradio UI
124
  # ----------------------------
125
with gr.Blocks() as demo:
    gr.Markdown("## Vectorization + Retrieval + Prompt Generation")

    # Step 1: upload the marking scheme and build the FAISS index.
    marking_scheme_upload = gr.File(label="Upload Marking Scheme PDF")
    vectorize_info = gr.JSON(label="Vectorization Info")
    vectorize_btn = gr.Button("Vectorize PDF")
    vectorize_btn.click(
        vectorize_pdf,
        inputs=[marking_scheme_upload],
        outputs=[vectorize_info],
    )

    # Step 2: upload student answers and generate retrieval-augmented
    # marking prompts.
    # NOTE(review): the raw gr.File value is handed straight to the
    # handlers — confirm load_pdf accepts it (an earlier revision
    # passed f.name instead).
    student_answers_upload = gr.File(label="Upload Student Answer PDF")
    marking_prompts = gr.JSON(label="Generated Prompts for Marking")
    prompts_btn = gr.Button("Generate Prompts")
    prompts_btn.click(
        create_prompts,
        inputs=[student_answers_upload],
        outputs=[marking_prompts],
    )

if __name__ == "__main__":
    # Bind to all interfaces (container-friendly) on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)