Davide Panza committed on
Commit 1d8ed3b · verified · 1 Parent(s): 7ef71d7

Upload 56 files

This view is limited to 50 files because the commit contains too many changes. See raw diff
Files changed (50)
  1. app/__init__.py +0 -0
  2. app/__pycache__/__init__.cpython-312.pyc +0 -0
  3. app/__pycache__/download_questions.cpython-312.pyc +0 -0
  4. app/__pycache__/main_IO.cpython-312.pyc +0 -0
  5. app/__pycache__/utils.cpython-312.pyc +0 -0
  6. app/backend/__init__.py +0 -0
  7. app/backend/__pycache__/__init__.cpython-312.pyc +0 -0
  8. app/backend/__pycache__/chromadb_utils.cpython-312.pyc +0 -0
  9. app/backend/__pycache__/chunks_processing.cpython-312.pyc +0 -0
  10. app/backend/__pycache__/get_requests.cpython-312.pyc +0 -0
  11. app/backend/__pycache__/messages_templates.cpython-312.pyc +0 -0
  12. app/backend/__pycache__/raw_text_processing.cpython-312.pyc +0 -0
  13. app/backend/__pycache__/runpod_client.cpython-312.pyc +0 -0
  14. app/backend/__pycache__/text_processing.cpython-312.pyc +0 -0
  15. app/backend/__pycache__/toc_parser.cpython-312.pyc +0 -0
  16. app/backend/chromadb_utils.py +61 -0
  17. app/backend/chunks_processing.py +32 -0
  18. app/backend/get_requests.py +37 -0
  19. app/backend/messages_templates.py +265 -0
  20. app/backend/raw_text_processing.py +167 -0
  21. app/backend/runpod_client.py +68 -0
  22. app/backend/text_processing.py +95 -0
  23. app/chromadb_model/1_Pooling/config.json +10 -0
  24. app/chromadb_model/README.md +173 -0
  25. app/chromadb_model/config.json +26 -0
  26. app/chromadb_model/config_sentence_transformers.json +10 -0
  27. app/chromadb_model/model.safetensors +3 -0
  28. app/chromadb_model/modules.json +20 -0
  29. app/chromadb_model/sentence_bert_config.json +4 -0
  30. app/chromadb_model/special_tokens_map.json +37 -0
  31. app/chromadb_model/tokenizer.json +0 -0
  32. app/chromadb_model/tokenizer_config.json +65 -0
  33. app/chromadb_model/vocab.txt +0 -0
  34. app/download_questions.py +23 -0
  35. app/main.py +170 -0
  36. app/main_IO.py +112 -0
  37. app/pages/1_chapter_questions.py +90 -0
  38. app/pages/2_topic_questions.py +106 -0
  39. app/pages/3_inspect_pdf.py +8 -0
  40. app/pages/__init__.py +0 -0
  41. app/pages/__pycache__/__init__.cpython-312.pyc +0 -0
  42. app/pages/__pycache__/page1_utils.cpython-312.pyc +0 -0
  43. app/pages/utils_chapter/__init__.py +0 -0
  44. app/pages/utils_chapter/__pycache__/__init__.cpython-312.pyc +0 -0
  45. app/pages/utils_chapter/__pycache__/chapter_extraction.cpython-312.pyc +0 -0
  46. app/pages/utils_chapter/__pycache__/chapter_selection.cpython-312.pyc +0 -0
  47. app/pages/utils_chapter/__pycache__/display_pages.cpython-312.pyc +0 -0
  48. app/pages/utils_chapter/__pycache__/display_questions.cpython-312.pyc +0 -0
  49. app/pages/utils_chapter/__pycache__/download_questions.cpython-312.pyc +0 -0
  50. app/pages/utils_chapter/__pycache__/page1_utils.cpython-312.pyc +0 -0
app/__init__.py ADDED
File without changes
app/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (155 Bytes)
app/__pycache__/download_questions.cpython-312.pyc ADDED
Binary file (1.44 kB)
app/__pycache__/main_IO.cpython-312.pyc ADDED
Binary file (4.73 kB)
app/__pycache__/utils.cpython-312.pyc ADDED
Binary file (2.64 kB)
app/backend/__init__.py ADDED
File without changes
app/backend/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (163 Bytes)
app/backend/__pycache__/chromadb_utils.cpython-312.pyc ADDED
Binary file (2.32 kB)
app/backend/__pycache__/chunks_processing.cpython-312.pyc ADDED
Binary file (2.05 kB)
app/backend/__pycache__/get_requests.cpython-312.pyc ADDED
Binary file (1.95 kB)
app/backend/__pycache__/messages_templates.cpython-312.pyc ADDED
Binary file (8.35 kB)
app/backend/__pycache__/raw_text_processing.cpython-312.pyc ADDED
Binary file (8.14 kB)
app/backend/__pycache__/runpod_client.cpython-312.pyc ADDED
Binary file (3.37 kB)
app/backend/__pycache__/text_processing.cpython-312.pyc ADDED
Binary file (3.7 kB)
app/backend/__pycache__/toc_parser.cpython-312.pyc ADDED
Binary file (821 Bytes)
app/backend/chromadb_utils.py ADDED
@@ -0,0 +1,61 @@
+import chromadb
+from chromadb.utils import embedding_functions
+from .text_processing import text_chunking
+
+
+def initialize_chromadb(EMBEDDING_MODEL, local_model_path=None):
+    """
+    Initialize ChromaDB client and embedding function, using a local model path if provided.
+    """
+    client = chromadb.Client()
+
+    if local_model_path:
+        embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
+            model_name=local_model_path
+        )
+    else:
+        embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
+            model_name=EMBEDDING_MODEL
+        )
+
+    return client, embedding_func
+
+
+def initialize_collection(client, embedding_func, collection_name):
+    """
+    Initialize a collection in ChromaDB.
+    """
+    collection = client.get_or_create_collection(
+        name=collection_name,
+        embedding_function=embedding_func,
+        metadata={"hnsw:space": "cosine"},
+    )
+
+    return collection
+
+
+def update_collection(
+    collection,
+    text,
+    max_words=200,
+    min_words=100,
+    overlap_sentences=3,
+):
+    """
+    Update the ChromaDB collection with text chunks.
+    Args:
+        collection: ChromaDB collection object.
+        text (str): The text to be chunked and added.
+        max_words (int): Maximum number of words per chunk.
+        min_words (int): Minimum number of words per chunk.
+        overlap_sentences (int): Number of sentences to overlap between chunks.
+    Returns:
+        None
+    """
+    chunks = text_chunking(text, max_words=max_words, min_words=min_words, overlap_sentences=overlap_sentences)
+    collection.add(
+        documents=chunks,
+        ids=[f"chunk_{j:04d}" for j in range(len(chunks))],
+        metadatas=[{"chunk_index": j} for j in range(len(chunks))]
+    )
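For context, a minimal usage sketch of the three helpers above. The model path and sample text are illustrative; the bundled `./chromadb_model` directory is what `local_model_path` is meant to point at:

```python
# Hypothetical wiring of the helpers above; paths and text are illustrative.
from app.backend.chromadb_utils import (
    initialize_chromadb,
    initialize_collection,
    update_collection,
)

client, embedding_func = initialize_chromadb(
    "all-MiniLM-L6-v2", local_model_path="./chromadb_model"
)
collection = initialize_collection(client, embedding_func, "whole_text_chunks")
update_collection(collection, "Some long extracted book text ...",
                  max_words=200, min_words=100, overlap_sentences=3)
print(collection.count())  # number of chunks stored
```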
app/backend/chunks_processing.py ADDED
@@ -0,0 +1,32 @@
+import random
+import streamlit as st
+
+
+def query_collection(collection, query='', nresults=3, context_multiplier=2, sim_th=None):
+    """
+    Get relevant text from a collection for a given query.
+    Returns a single joined string when sim_th is set, otherwise a list of documents.
+    """
+    query_result = collection.query(query_texts=query, n_results=nresults * context_multiplier)
+    docs = query_result.get('documents')[0]
+
+    if sim_th is not None:
+        # ChromaDB returns cosine distances; convert to similarities before thresholding
+        similarities = [1 - d for d in query_result.get("distances")[0]]
+        relevant_docs = [d for d, s in zip(docs, similarities) if s >= sim_th]
+        return ''.join(relevant_docs)
+    return docs
+
+
+def get_chapter_context(chapters, chapter_number, n_questions):
+    chapter = chapters[chapter_number]
+    if chapter is None:
+        raise ValueError(f"Chapter {chapter_number} not found in the chapters list.")
+    if 'chunks' not in chapter:
+        raise ValueError(f"Chapter {chapter_number} does not contain a 'chunks' key.")
+
+    n_chunks = len(chapter['chunks'])
+    if n_chunks == 0:
+        raise ValueError(f"Chapter {chapter_number} has no chunks to process.")
+
+    chunks_indices = random.sample(range(n_chunks), min(n_questions, n_chunks))
+    st.session_state['chapter_selected_chunks'] = [chapter['chunks'][i] for i in chunks_indices]
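Continuing the sketch above, this is how `query_collection` might be called with a similarity threshold (the query string and threshold are invented); with `sim_th` set it returns one joined string, otherwise a list of documents:

```python
from app.backend.chunks_processing import query_collection

# 'collection' is the ChromaDB collection from the chromadb_utils sketch above.
relevant_text = query_collection(
    collection,
    query="central limit theorem",
    nresults=3,
    context_multiplier=2,  # over-fetch 3 * 2 = 6 candidates before filtering
    sim_th=0.3,            # keep only chunks with cosine similarity >= 0.3
)
```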
app/backend/get_requests.py ADDED
@@ -0,0 +1,37 @@
+import streamlit as st
+
+from app.backend.runpod_client import run_prompt, clean_and_parse_json
+from app.backend.messages_templates import toc_prompt, chapter_prompt, chapter_prompt_edgecase
+
+
+def extract_chapters_from_toc(toc_text: str):
+    # toc_prompt returns a single string already formatted for Gemma 3
+    prompt = toc_prompt(toc_text)
+    raw_output = run_prompt(prompt)
+    st.session_state['chapters_dict'] = clean_and_parse_json(raw_output)
+
+
+def generate_questions_from_chapter(chunks, num_questions, max_questions=5):
+    # chapter_prompt returns a single string already formatted for Gemma 3
+    prompt = chapter_prompt(contexts=chunks, num_questions=num_questions, max_questions=max_questions)
+    raw_output = run_prompt(prompt)
+    try:
+        generated_questions = clean_and_parse_json(raw_output)
+        st.success("Questions generated successfully!")
+        return generated_questions
+    except ValueError:
+        st.error("Could not parse the model output as JSON.")
+
+
+def generate_questions_from_chapter_edgecase(chunks, num_questions, max_questions=5):
+    # chapter_prompt_edgecase returns a single string already formatted for Gemma 3
+    prompt = chapter_prompt_edgecase(grouped_chunks=chunks, num_questions=num_questions, max_questions=max_questions)
+    raw_output = run_prompt(prompt)
+    try:
+        generated_questions = clean_and_parse_json(raw_output)
+        st.success("Questions generated successfully!")
+        return generated_questions
+    except ValueError:
+        st.error("Could not parse the model output as JSON.")
app/backend/messages_templates.py ADDED
@@ -0,0 +1,265 @@
+# Superseded messages-based version of the TOC prompt, kept for reference:
+# def get_toc_extraction_messages(toc_text: str):
+#     return [
+#         {
+#             "role": "system",
+#             "content": "You are a precise document parser that extracts structured information from a table of contents. You NEVER hallucinate, invent, or make up information. You ONLY extract what is explicitly present in the provided text. If you cannot find clear chapter information, you return an empty array. You do not guess chapter titles or page numbers."
+#         },
+#         {
+#             "role": "user",
+#             "content": "I need to extract main chapter information from this table of contents. Only extract numbered chapters, ignore subsections. Do not make up any information."
+#         },
+#         {
+#             "role": "assistant",
+#             "content": "I understand. I will extract ONLY the main chapters that are explicitly shown in your table of contents. I will not invent, guess, or hallucinate any chapter titles or page numbers. I will only use the exact information present in the document."
+#         },
+#         {
+#             "role": "user",
+#             "content": f"""Here is the table of contents:
+#
+# {toc_text}
+#
+# WARNING: DO NOT HALLUCINATE OR INVENT INFORMATION
+# - Do NOT make up chapter titles like "Probability", "Statistical Inference", "Linear Regression"
+# - Do NOT guess page numbers
+# - Do NOT create generic textbook chapters
+# - ONLY extract what you can clearly see in the provided text
+#
+# CRITICAL RULES:
+# 1. Extract ONLY main chapters that start with a number (1, 2, 3, etc.)
+# 2. Do NOT extract subsections (like 1.1, 1.2, 2.1, etc.)
+# 3. Use the EXACT chapter titles shown in the document
+# 4. Use the EXACT page numbers shown in the document
+# 5. Handle both roman numerals (i, ii, iii, v, x) and arabic numerals (1, 25, 100)
+# 6. Calculate end pages as: next chapter's start page minus 1
+# 7. Return ONLY valid JSON - no explanations, no markdown formatting
+# 8. If you cannot clearly identify chapters, return an empty array []
+#
+# Look for patterns like:
+# - "1 Probability Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1"
+# - "2 Distribution Theory and Statistical Models . . . . . . . . . . . . . . . . 155"
+# - "3 Basic Statistical Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 205"
+#
+# DO NOT extract lines like:
+# - "1.1 Some Important Music Concepts . . . . . . . . . . . 3"
+# - "Preface . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v"
+#
+# Use ONLY the exact titles from the document. Do not shorten or modify them.
+#
+# Return JSON array: [{{"chapter_number": "X", "chapter_title": "...", "start_page": X, "end_page": X}}]
+#
+# REMEMBER: Extract only what is explicitly visible in the text. Do not hallucinate. Be complete and extract all chapters that are clearly numbered. If you cannot clearly identify any chapters, return an empty array []."""
+#         },
+#         {
+#             "role": "assistant",
+#             "content": "I will carefully examine the table of contents and extract only the main chapters that are explicitly shown, using their exact titles and page numbers. I will not invent or hallucinate any information."
+#         }
+#     ]
+
+
+def toc_prompt(toc_text: str):
+    # Convert to Gemma 3 format - single string with proper turn markers
+    prompt = f"""<start_of_turn>user
+You are a precise document parser that extracts structured information from a table of contents. You NEVER hallucinate, invent, or make up information. You ONLY extract what is explicitly present in the provided text. If you cannot find clear chapter information, you return an empty array.
+
+I need to extract main chapter information from this table of contents. Only extract numbered chapters, ignore subsections. Do not make up any information.
+
+Here is the table of contents:
+
+{toc_text}
+
+WARNING: DO NOT HALLUCINATE OR INVENT INFORMATION
+- Do NOT make up chapter titles like "Probability", "Statistical Inference", "Linear Regression"
+- Do NOT guess page numbers
+- Do NOT create generic textbook chapters
+- ONLY extract what you can clearly see in the provided text
+
+CRITICAL RULES:
+1. Extract ONLY main chapters that start with a number (1, 2, 3, etc.)
+2. Do NOT extract subsections (like 1.1, 1.2, 2.1, etc.)
+3. Use the EXACT chapter titles shown in the document
+4. Use the EXACT page numbers shown in the document
+5. Handle both roman numerals (i, ii, iii, v, x) and arabic numerals (1, 25, 100)
+6. Calculate end pages as: next chapter's start page minus 1
+7. Return ONLY valid JSON - no explanations, no markdown formatting
+8. If you cannot clearly identify chapters, return an empty array []
+
+Look for patterns like:
+- "1 Probability Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1"
+- "2 Distribution Theory and Statistical Models . . . . . . . . . . . . . . . . 155"
+- "3 Basic Statistical Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 205"
+
+DO NOT extract lines like:
+- "1.1 Some Important Music Concepts . . . . . . . . . . . 3"
+- "Preface . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v"
+
+Use ONLY the exact titles from the document. Do not shorten or modify them.
+
+Return JSON array: [{{"chapter_number": "X", "chapter_title": "...", "start_page": X, "end_page": X}}]
+
+Extract only what is explicitly visible in the text. Do not hallucinate. Be complete and extract all chapters that are clearly numbered. If there are no clear main chapters, return an empty array [].<end_of_turn>
+<start_of_turn>model
+I will carefully examine the table of contents and extract only the main chapters that are explicitly shown, using their exact titles and page numbers. I will not invent or hallucinate any information.
+
+Looking at the provided table of contents, I will now extract the main chapters:<end_of_turn>
+<start_of_turn>user
+Perfect. Now provide the JSON array with the extracted chapters.<end_of_turn>
+<start_of_turn>model
+"""
+
+    return prompt
+
+
+def chapter_prompt(contexts, num_questions, max_questions=5):
+    """
+    Create a prompt formatted for the Gemma 3 12B-IT model.
+    This prompt is designed to generate diverse questions based on provided text contexts.
+    Args:
+        contexts (list): List of text contexts to base questions on.
+        num_questions (int): Number of questions to generate.
+        max_questions (int): Maximum number of questions allowed.
+    Returns:
+        str: Formatted prompt string for the Gemma 3 model.
+    """
+
+    # Gemma uses special tokens for instruction tuning
+    prompt = """<start_of_turn>user
+You are a question generation expert. Generate exactly {num_questions} diverse questions based on the provided text contexts.
+
+IMPORTANT REQUIREMENTS:
+1. Output MUST be valid JSON format
+2. Generate EXACTLY {num_questions} questions
+3. Each question must have a complete answer from the contexts
+4. Vary question types (what, why, how, when, explain, compare)
+5. Do not generate yes/no questions
+6. Answers should be 1-3 sentences long
+
+CONTEXTS:
+{contexts}
+
+OUTPUT FORMAT - Return ONLY a valid JSON array:
+[
+  {{"question": "Your question here?", "answer": "Complete answer from the context"}},
+  {{"question": "Another question?", "answer": "Another answer"}}
+]
+
+Generate the questions now:<end_of_turn>
+<start_of_turn>model
+""".format(
+        num_questions=min(num_questions, max_questions),
+        contexts=format_contexts(contexts)
+    )
+
+    return prompt
+
+
+def chapter_prompt_edgecase(grouped_chunks, num_questions, max_questions=5):
+    """
+    Create a prompt formatted for the Gemma 3 12B-IT model.
+    This prompt handles the edge case where fewer contexts are retrieved than questions requested.
+    Args:
+        grouped_chunks (list): List of grouped text contexts to base questions on.
+        num_questions (int): Number of questions to generate.
+        max_questions (int): Maximum number of questions allowed.
+    Returns:
+        str: Formatted prompt string for the Gemma 3 model.
+    """
+
+    prompt = """<start_of_turn>user
+Generate {num_questions} questions from the following contexts. You may:
+- Generate one or more questions from each context
+- Use multiple contexts for a single question
+- Skip contexts if they don't contain meaningful information
+
+REQUIREMENTS:
+1. Output valid JSON array format
+2. Generate EXACTLY {num_questions} questions
+3. Each answer must be found in the provided contexts
+4. Create diverse question types
+5. Reference which context group(s) you used
+
+CONTEXT GROUPS:
+{context_groups}
+
+OUTPUT FORMAT - Return ONLY this JSON structure:
+[
+  {{"question": "Question text?", "answer": "Answer text", "context_used": [1, 2]}},
+  {{"question": "Question text?", "answer": "Answer text", "context_used": [1]}}
+]
+
+Generate the questions:<end_of_turn>
+<start_of_turn>model
+""".format(
+        num_questions=min(num_questions, max_questions),
+        context_groups=format_contexts(grouped_chunks)
+    )
+
+    return prompt
+
+
+def book_prompt(contexts, num_questions, user_query=None, max_questions=5):
+    """
+    Create a prompt formatted for the Gemma 3 12B-IT model with topic awareness.
+
+    Args:
+        contexts (list): List of text contexts retrieved based on the user query.
+        num_questions (int): Number of questions to generate.
+        user_query (str): The original user query/topic.
+        max_questions (int): Maximum number of questions allowed.
+
+    Returns:
+        str: Formatted prompt string for the Gemma 3 model.
+    """
+
+    num_questions = min(num_questions, max_questions)
+
+    # Build topic context section if a query is provided
+    topic_context = ""
+    if user_query:
+        topic_context = f"""
+TOPIC FOCUS: {user_query}
+The following contexts were retrieved based on this topic. Generate questions that:
+- Relate to the main topic: "{user_query}"
+- Explore different aspects of this topic found in the contexts
+- Connect the topic to broader concepts when relevant
+
+"""
+
+    prompt = """<start_of_turn>user
+You are a question generation expert. Generate exactly {num_questions} diverse questions based on the provided text contexts.
+{topic_context}
+IMPORTANT REQUIREMENTS:
+1. Output MUST be valid JSON format
+2. Generate EXACTLY {num_questions} questions
+3. Each question must have a complete answer from the contexts
+4. Vary question types (what, why, how, when, explain, compare)
+5. Do not generate yes/no questions
+6. Answers should be 1-3 sentences long
+7. Questions should explore different aspects of the topic
+
+CONTEXTS (Retrieved based on topic: "{query}"):
+{contexts}
+
+OUTPUT FORMAT - Return ONLY a valid JSON array:
+[
+  {{"question": "Your question here?", "answer": "Complete answer from the context"}},
+  {{"question": "Another question?", "answer": "Another answer"}}
+]
+
+Generate the questions now:<end_of_turn>
+<start_of_turn>model
+""".format(
+        num_questions=num_questions,
+        topic_context=topic_context,
+        query=user_query if user_query else "the provided content",
+        contexts=format_contexts(contexts)
+    )
+
+    return prompt
+
+
+def format_contexts(contexts):
+    """
+    Format contexts for better readability.
+    """
+    formatted = ""
+    for i, context in enumerate(contexts, 1):
+        formatted += f"Context {i}:\n{context.strip()}\n\n"
+    return formatted.strip()
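To make the templates concrete, a small sketch of `format_contexts` output and the resulting Gemma-style prompt (the contexts are invented):

```python
from app.backend.messages_templates import chapter_prompt, format_contexts

contexts = [
    "Bayes' theorem relates conditional probabilities.",
    "The prior encodes beliefs held before seeing the data.",
]
print(format_contexts(contexts))
# Context 1:
# Bayes' theorem relates conditional probabilities.
#
# Context 2:
# The prior encodes beliefs held before seeing the data.

prompt = chapter_prompt(contexts, num_questions=2)
assert prompt.startswith("<start_of_turn>user")  # Gemma 3 turn marker
```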
app/backend/raw_text_processing.py ADDED
@@ -0,0 +1,167 @@
+import fitz  # PyMuPDF
+import warnings
+import streamlit as st
+
+
+def extract_page_data_fitz(doc):
+    """
+    Extracts page numbers and text from a PDF file using PyMuPDF.
+    The function looks for page numbers in the top and bottom 15% of each page.
+    It returns a list of dictionaries, each containing the page index, page number,
+    and the full text of the page.
+    """
+    pages_data = []
+
+    for i, page in enumerate(doc):
+        height = page.rect.height
+        width = page.rect.width
+
+        top_rect = fitz.Rect(0, 0, width, height * 0.15)
+        bottom_rect = fitz.Rect(0, height * 0.85, width, height)
+
+        top_text = page.get_text("text", clip=top_rect).split()
+        bottom_text = page.get_text("text", clip=bottom_rect).split()
+
+        found_number = next((int(text) for text in top_text + bottom_text if text.isdigit()), None)
+        full_text = page.get_text("text")
+
+        pages_data.append({
+            "index": i,
+            "number": found_number,
+            "content": full_text
+        })
+
+    return pages_data
+
+
+def correct_page_numbers(pages_data, sequence_length=10):
+    """
+    Corrects page numbers by finding the first sequence of consecutive values,
+    filling gaps forward and backward, and setting values < 1 to None.
+    Returns the index of the first page numbered 1, or None if no sequence is found.
+    """
+    try:
+        seen = [(i, d["number"]) for i, d in enumerate(pages_data) if isinstance(d["number"], int)]
+
+        for start in range(len(seen) - sequence_length + 1):
+            if all(seen[start + j][1] == seen[start][1] + j for j in range(sequence_length)):
+                base_index, base_number = seen[start]
+                break
+        else:
+            return None
+
+        # Fill forward from the anchor page
+        for offset, page in enumerate(pages_data[base_index:], start=0):
+            page["number"] = base_number + offset
+
+        # Fill backward from the anchor page
+        for offset in range(1, base_index + 1):
+            page = pages_data[base_index - offset]
+            page["number"] = base_number - offset
+
+        # Pages before the numbered body get no page number
+        for page in pages_data:
+            if page["number"] < 1:
+                page["number"] = None
+
+        return next((page['index'] for page in pages_data if page["number"] == 1), None)
+
+    except Exception:
+        return None
+
+
+def extract_text(doc, start_chapter=None):
+    """
+    Extracts the text of the book starting from the specified page index.
+    If no start_chapter is provided, it returns the whole doc.
+    """
+    if start_chapter is not None:
+        all_pages_text = [
+            doc[page_num].get_text("text")
+            for page_num in range(start_chapter, len(doc))
+        ]
+        return "\n".join(all_pages_text)
+    else:
+        warnings.warn(
+            "No chapter start has been detected: extracting text from the entire PDF.",
+            UserWarning
+        )
+        return "\n".join(page.get_text("text") for page in doc)
+
+
+def process_pdf():
+    """
+    Processes a PDF file to extract text starting from the first chapter.
+    """
+    pdf_bytes = st.session_state.get("uploaded_pdf_bytes")
+    if not pdf_bytes:
+        st.error("No PDF uploaded.")
+        return
+
+    with st.spinner("Processing uploaded file..."):
+        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
+            pages_data_infos = extract_page_data_fitz(doc)
+            chapters_starting_page = correct_page_numbers(pages_data_infos)
+            full_text = extract_text(doc, chapters_starting_page)
+
+    st.session_state['full_text'] = full_text
+    st.session_state['pages_data_infos'] = pages_data_infos
+    st.session_state['chapters_starting_page'] = chapters_starting_page
+
+
+def extract_toc(page_range):
+    """
+    Extracts text from specific pages in a PDF file using PyMuPDF.
+    This is used to extract the TOC from a range of page numbers indicated by the user.
+    """
+    pdf_bytes = st.session_state.get("uploaded_pdf_bytes")
+    if pdf_bytes is None:
+        st.error("No PDF uploaded.")
+        return ""
+
+    chapters_content_list = []
+    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
+        for page_num in page_range:
+            if 0 <= page_num < len(doc):
+                text = doc[page_num].get_text("text")
+                chapters_content_list.append(text)
+            else:
+                print(f"Warning: Page number {page_num} is out of bounds.")
+
+    toc_text = "\n".join(chapters_content_list)
+    st.session_state["toc"] = toc_text
+
+
+def extract_chapters(chapters_dict, pages_data_corrected):
+    """
+    Extract chapters from the parsed TOC and the pages data.
+    Args:
+        chapters_dict (list): List of chapter dictionaries from the TOC.
+        pages_data_corrected (list): List of page data dictionaries with content.
+    Returns:
+        None (stores the list of chapter dictionaries in st.session_state['chapters_extracted']).
+    """
+    chapters = []
+
+    # Iterate through each chapter parsed from the TOC
+    for chapter in chapters_dict:
+        start_page = chapter['start_page']
+        end_page = chapter['end_page']
+        chapter_text = []
+
+        # Extract content for the chapter from the pages data
+        for page_idx in range(start_page - 1, end_page):
+            chapter_text.append(pages_data_corrected[page_idx]['content'])
+
+        chapter_text = ' '.join(chapter_text)
+
+        # Create a dictionary for the chapter
+        chapter_dict = {
+            'chapter_number': chapter['chapter_number'],
+            'chapter_title': chapter['chapter_title'],
+            'start_page': start_page,
+            'end_page': end_page,
+            'content': chapter_text
+        }
+
+        chapters.append(chapter_dict)
+
+    st.session_state['chapters_extracted'] = chapters
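A toy run of `correct_page_numbers`, assuming a document where extraction missed one printed page number (the data is invented):

```python
from app.backend.raw_text_processing import correct_page_numbers

pages_data = [
    {"index": 0, "number": None, "content": "cover"},
    {"index": 1, "number": None, "content": "table of contents"},
    {"index": 2, "number": 1, "content": "chapter 1, page 1"},
    {"index": 3, "number": 2, "content": "chapter 1, page 2"},
    {"index": 4, "number": None, "content": "chapter 1, page 3"},  # number missed
    {"index": 5, "number": 4, "content": "chapter 1, page 4"},
]
start = correct_page_numbers(pages_data, sequence_length=2)
print(start)                              # 2: index of the page numbered 1
print([p["number"] for p in pages_data])  # [None, None, 1, 2, 3, 4]
```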
app/backend/runpod_client.py ADDED
@@ -0,0 +1,68 @@
+import os
+import time
+import requests
+from dotenv import load_dotenv
+import json
+import codecs
+from pathlib import Path
+
+# Load .env from project root
+load_dotenv(dotenv_path=Path(__file__).resolve().parents[2] / ".env")
+
+API_KEY = os.getenv("RUNPOD_API_KEY")
+ENDPOINT = os.getenv("RUNPOD_ENDPOINT")
+
+HEADERS = {
+    "Authorization": f"Bearer {API_KEY}",
+    "Content-Type": "application/json"
+}
+
+
+def format_messages_as_prompt(messages):
+    """Convert a messages list to a single prompt string for the model."""
+    parts = []
+    for message in messages:
+        parts.append(f"{message['role'].capitalize()}: {message['content']}")
+    parts.append("Assistant:")
+    return "\n\n".join(parts)
+
+
+def run_prompt(prompt: str) -> str:
+    """Submit a prompt to the RunPod endpoint and get back a response string."""
+    payload = {"input": {"prompt": prompt}}
+
+    # Start job
+    response = requests.post(f"{ENDPOINT}/run", headers=HEADERS, json=payload)
+    job_id = response.json().get("id")
+    print(f"[RunPod] Job started: {job_id}")
+
+    # Poll for status
+    while True:
+        status_res = requests.get(f"{ENDPOINT}/status/{job_id}", headers=HEADERS).json()
+        status = status_res.get("status")
+        print(f"[RunPod] Status: {status}")
+        if status in ("COMPLETED", "FAILED"):
+            break
+        time.sleep(3)
+
+    if status == "COMPLETED":
+        return status_res["output"]["response"]
+    else:
+        raise RuntimeError("RunPod job failed.")
+
+
+def clean_and_parse_json(raw_text: str):
+    """Clean and parse model output into JSON."""
+    # str.strip("```json") removes *characters*, not a prefix, so peel off
+    # markdown code fences with removeprefix/removesuffix instead.
+    cleaned = raw_text.strip()
+    cleaned = cleaned.removeprefix("```json").removeprefix("```")
+    cleaned = cleaned.removesuffix("```").strip().strip("'")
+    try:
+        return json.loads(cleaned)
+    except json.JSONDecodeError:
+        try:
+            # Handle escaped quotes
+            unescaped = codecs.decode(cleaned, 'unicode_escape')
+            return json.loads(unescaped)
+        except Exception as e:
+            raise ValueError("Could not parse JSON output") from e
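For illustration, the kind of fenced model output `clean_and_parse_json` is meant to recover (the payload is invented):

```python
from app.backend.runpod_client import clean_and_parse_json

raw = '```json\n[{"question": "What is a prior?", "answer": "A distribution over parameters."}]\n```'
parsed = clean_and_parse_json(raw)
print(parsed[0]["question"])  # What is a prior?
```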
app/backend/text_processing.py ADDED
@@ -0,0 +1,95 @@
+from nltk.tokenize import sent_tokenize
+import nltk
+import streamlit as st
+
+try:
+    nltk.data.find("tokenizers/punkt")
+except LookupError:
+    nltk.download("punkt")
+
+
+def text_chunking(text, max_words=750, min_words=400, overlap_sentences=5):
+    """
+    Creates text chunks of up to max_words, using sentences as indivisible units.
+    Each chunk can overlap with the next one by overlap_sentences.
+    Chunks smaller than min_words are merged with the next chunk.
+    """
+    sentences = sent_tokenize(text)
+    word_counts = [len(sentence.split()) for sentence in sentences]
+
+    chunks = []
+    i = 0
+
+    while i < len(sentences):
+        chunk_sentences = []
+        word_count = 0
+        chunk_start = i
+
+        # Build chunk
+        while i < len(sentences):
+            if word_count + word_counts[i] > max_words and chunk_sentences:
+                break
+            chunk_sentences.append(sentences[i])
+            word_count += word_counts[i]
+            i += 1
+
+        if chunk_sentences:
+            chunks.append(" ".join(chunk_sentences))
+
+        # Add overlap for the next chunk
+        if i < len(sentences):
+            chunk_size = len(chunk_sentences)
+            overlap = min(overlap_sentences, chunk_size - 1)
+            i = max(i - overlap, chunk_start + 1)
+
+    # Merge small chunks with the next chunk
+    merged_chunks = []
+    i = 0
+    while i < len(chunks):
+        current_chunk = chunks[i]
+        current_words = len(current_chunk.split())
+
+        # If the current chunk is too small and there's a next chunk, merge them
+        if current_words < min_words and i + 1 < len(chunks):
+            next_chunk = chunks[i + 1]
+            next_words = len(next_chunk.split())
+
+            # Only merge if the combined size won't be too large
+            if current_words + next_words <= max_words:
+                merged_chunk = current_chunk + " " + next_chunk
+                merged_chunks.append(merged_chunk)
+                i += 2  # Skip the next chunk since we merged it
+            else:
+                # Keep the small chunk as-is if merging would be too large
+                merged_chunks.append(current_chunk)
+                i += 1
+        else:
+            merged_chunks.append(current_chunk)
+            i += 1
+
+    # Remove chunks that are too long (likely data blocks or malformed content)
+    final_chunks = [chunk for chunk in merged_chunks if len(chunk.split()) <= 1000]
+
+    return final_chunks
+
+
+def chapters_chunking(chapters, max_words=500, min_words=300, overlap_sentences=5):
+    """
+    Chunk the chapters into smaller parts based on word count and overlap.
+
+    :param chapters: List of chapter dictionaries.
+    :param max_words: Maximum number of words per chunk.
+    :param min_words: Minimum number of words per chunk.
+    :param overlap_sentences: Number of sentences to overlap between chunks.
+    :return: None (stores the chunked chapters in st.session_state['chapters_chunked']).
+    """
+    st.session_state['chapters_chunked'] = [
+        {
+            'chapter_number': chapter['chapter_number'],
+            'chapter_title': chapter['chapter_title'],
+            'chunks': text_chunking(chapter['content'], max_words, min_words, overlap_sentences)
+        }
+        for chapter in chapters
+    ]
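A quick sanity check of `text_chunking` on a toy paragraph, with the limits set small so the sentence overlap is visible (the text is invented):

```python
from app.backend.text_processing import text_chunking

text = ("Probability measures uncertainty. "
        "A random variable maps outcomes to numbers. "
        "Expectation is a weighted average. "
        "Variance measures spread around the mean.")
for chunk in text_chunking(text, max_words=12, min_words=5, overlap_sentences=1):
    print(len(chunk.split()), "|", chunk)
# Each chunk stays under 12 words and repeats the last sentence of its predecessor.
```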
app/chromadb_model/1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
+{
+  "word_embedding_dimension": 384,
+  "pooling_mode_cls_token": false,
+  "pooling_mode_mean_tokens": true,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false,
+  "pooling_mode_weightedmean_tokens": false,
+  "pooling_mode_lasttoken": false,
+  "include_prompt": true
+}
app/chromadb_model/README.md ADDED
@@ -0,0 +1,173 @@
+---
+language: en
+license: apache-2.0
+library_name: sentence-transformers
+tags:
+- sentence-transformers
+- feature-extraction
+- sentence-similarity
+- transformers
+datasets:
+- s2orc
+- flax-sentence-embeddings/stackexchange_xml
+- ms_marco
+- gooaq
+- yahoo_answers_topics
+- code_search_net
+- search_qa
+- eli5
+- snli
+- multi_nli
+- wikihow
+- natural_questions
+- trivia_qa
+- embedding-data/sentence-compression
+- embedding-data/flickr30k-captions
+- embedding-data/altlex
+- embedding-data/simple-wiki
+- embedding-data/QQP
+- embedding-data/SPECTER
+- embedding-data/PAQ_pairs
+- embedding-data/WikiAnswers
+pipeline_tag: sentence-similarity
+---
+
+
+# all-MiniLM-L6-v2
+This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+
+## Usage (Sentence-Transformers)
+Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
+
+```
+pip install -U sentence-transformers
+```
+
+Then you can use the model like this:
+```python
+from sentence_transformers import SentenceTransformer
+sentences = ["This is an example sentence", "Each sentence is converted"]
+
+model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+embeddings = model.encode(sentences)
+print(embeddings)
+```
+
+## Usage (HuggingFace Transformers)
+Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, you pass your input through the transformer model, then you apply the right pooling operation on top of the contextualized word embeddings.
+
+```python
+from transformers import AutoTokenizer, AutoModel
+import torch
+import torch.nn.functional as F
+
+# Mean pooling - take the attention mask into account for correct averaging
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+
+# Sentences we want sentence embeddings for
+sentences = ['This is an example sentence', 'Each sentence is converted']
+
+# Load model from HuggingFace Hub
+tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+
+# Tokenize sentences
+encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+
+# Compute token embeddings
+with torch.no_grad():
+    model_output = model(**encoded_input)
+
+# Perform pooling
+sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+
+# Normalize embeddings
+sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+
+print("Sentence embeddings:")
+print(sentence_embeddings)
+```
+
+------
+
+## Background
+
+The project aims to train sentence embedding models on very large sentence-level datasets using a self-supervised
+contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned it on a
+dataset of 1B sentence pairs. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences was actually paired with it in our dataset.
+
+We developed this model during the
+[Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
+organized by Hugging Face. We developed this model as part of the project:
+[Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPU v3-8s, as well as guidance from Google's Flax, JAX, and Cloud team members on efficient deep learning frameworks.
+
+## Intended uses
+
+Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
+the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
+
+By default, input text longer than 256 word pieces is truncated.
+
+
+## Training procedure
+
+### Pre-training
+
+We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
+
+### Fine-tuning
+
+We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity of each possible sentence pair from the batch.
+We then apply the cross-entropy loss by comparing with the true pairs.
+
+#### Hyperparameters
+
+We trained our model on a TPU v3-8 for 100k steps using a batch size of 1024 (128 per TPU core).
+We used a learning-rate warm-up of 500 steps. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
+a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
+
+#### Training data
+
+We used a concatenation of multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion.
+We sampled each dataset with a weighted probability, whose configuration is detailed in the `data_config.json` file.
+
+
+| Dataset | Paper | Number of training tuples |
+|--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
+| [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
+| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
+| [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
+| [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
+| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
+| [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 |
+| [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
+| [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
+| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
+| [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
+| [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395 |
+| [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
+| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
+| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
+| [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
+| [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
+| [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | - | 304,525 |
+| AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/)) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | - | 250,519 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | - | 250,460 |
+| [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
+| [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
+| [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
+| [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
+| [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
+| [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
+| [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
+| [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
+| **Total** | | **1,170,060,424** |
app/chromadb_model/config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.3",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}
app/chromadb_model/config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
+{
+  "__version__": {
+    "sentence_transformers": "4.1.0",
+    "transformers": "4.48.3",
+    "pytorch": "2.6.0"
+  },
+  "prompts": {},
+  "default_prompt_name": null,
+  "similarity_fn_name": "cosine"
+}
app/chromadb_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1377e9af0ca0b016a9f2aa584d6fc71ab3ea6804fae21ef9fb1416e2944057ac
+size 90864192
app/chromadb_model/modules.json ADDED
@@ -0,0 +1,20 @@
+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]
app/chromadb_model/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+{
+  "max_seq_length": 256,
+  "do_lower_case": false
+}
app/chromadb_model/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
app/chromadb_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
app/chromadb_model/tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "max_length": 128,
+  "model_max_length": 256,
+  "never_split": null,
+  "pad_to_multiple_of": null,
+  "pad_token": "[PAD]",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "sep_token": "[SEP]",
+  "stride": 0,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "[UNK]"
+}
app/chromadb_model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
app/download_questions.py ADDED
@@ -0,0 +1,23 @@
+import streamlit as st
+from docx import Document
+from io import BytesIO
+from datetime import datetime
+
+
+def create_docx_from_data(data):
+    doc = Document()
+    doc.add_heading("Questions", 0)
+    doc.add_paragraph(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
+
+    for chapter, qas in data.items():
+        doc.add_heading(chapter, level=1)
+        doc.add_paragraph("")  # Spacing
+        for idx, qa in enumerate(qas, 1):
+            doc.add_paragraph(f"Q{idx}: {qa['question']}", style='List Number')
+            doc.add_paragraph(f"A: {qa['answer']}", style='Normal')
+            doc.add_paragraph("")  # Spacing
+
+    buffer = BytesIO()
+    doc.save(buffer)
+    buffer.seek(0)
+    return buffer
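A sketch of using `create_docx_from_data` outside Streamlit (the question data is invented):

```python
from app.download_questions import create_docx_from_data

data = {
    "Chapter 1: Probability Theory": [
        {"question": "What does the CLT state?",
         "answer": "Sums of many independent variables are approximately normal."},
    ],
}
buffer = create_docx_from_data(data)
with open("questions.docx", "wb") as f:
    f.write(buffer.getvalue())
```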
app/main.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from utils import *
3
+ from main_IO import *
4
+ from download_questions import create_docx_from_data
5
+ from backend.raw_text_processing import *
6
+ from backend.chromadb_utils import *
7
+ import os
8
+ import sys
9
+ import logging
10
+
11
+
12
+ # Add the root folder (one level above 'app') to sys.path
13
+ root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
14
+ if root_path not in sys.path:
15
+ sys.path.insert(0, root_path)
16
+
17
+ # Configuration
18
+ configure_page()
19
+ initialise_session_state()
20
+ apply_style()
21
+
22
+ # add_sidebar_header()
23
+ st.sidebar.html("""
24
+ <div style='position: fixed; top: 10px; left: 20px; z-index: 999; padding: 10px;'>
25
+ <h3>Menu</h3>
26
+ </div>
27
+ """)
28
+
29
+ # Initialize chromadb variables
30
+ EMBEDDING_MODEL = "all-MiniLM-L6-v2"
31
+ model_path = "./chromadb_model"
32
+
33
+ # Set-up Logger
34
+ st.session_state.use_logger = False
35
+ if st.session_state.use_logger:
36
+ level = st.selectbox("Logging level", ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
37
+ logging.getLogger().setLevel(getattr(logging, level))
38
+
39
+ # Set default page if not specified
40
+ if "page" not in st.query_params:
41
+ st.query_params.page = "main"
42
+
43
+ # Navigation handling
44
+ if st.query_params.page == "topic":
45
+ st.switch_page("pages/2_topic_questions.py")
46
+ elif st.query_params.page == "chapter":
47
+ st.switch_page("pages/1_chapter_questions.py")
48
+ elif st.query_params.page == "inspect":
49
+ st.switch_page("pages/3_inspect_pdf.py")
50
+ else:
+     # Welcome message
+     st.title("Welcome to Text2Test!")
+     st.divider()
+     st.markdown("""
+ Welcome! This app helps you transform your PDFs or texts into interactive study materials by generating meaningful questions.
+ You can either:
+
+ - Generate questions based on specific topics or keywords
+ - Generate questions from a selected chapter
+
+ Start by uploading your PDF file, then choose your preferred way to generate questions using the options below.
+ Let’s make studying smarter and more engaging!
+     """)
+     st.divider()
+
+     # Upload PDF file
+     st.subheader("Upload your PDF file")
+     upload_pdf()
+     st.divider()
+
+     # Check if PDF has changed or needs processing
+     if st.session_state.get("pdf_changed") or (
+         st.session_state.get("full_text") is None and
+         st.session_state.get("uploaded_pdf_bytes") is not None
+     ):
+         process_pdf()  # Extract text from PDF
+
+         with st.spinner("Extracting information from the text..."):
+             client, embedding_func = initialize_chromadb(EMBEDDING_MODEL)
+             whole_text_collection = initialize_collection(client, embedding_func, "whole_text_chunks")
+             update_collection(
+                 whole_text_collection,
+                 st.session_state.get("full_text"),
+                 max_words=200,
+                 min_words=100,
+                 overlap_sentences=3
+             )
+         st.session_state["pdf_changed"] = False  # Reset flag after processing
+
+     try:
+         uploaded_pdf_name = st.session_state.get('uploaded_pdf_name')
+         if uploaded_pdf_name:
+             st.info(f"Uploaded PDF: {uploaded_pdf_name}")
+             debug_log(f"book title: {uploaded_pdf_name}")
+
+         show_pdf_preview()
+
+     except Exception as e:
+         debug_log(f"Error displaying PDF info or preview: {e}")
+
+     # Main content buttons
+     st.subheader("Generate Questions")
+     st.write("Please choose an option to generate questions:")
+     breaks(1)
+     cols = st.columns(2)
+     st.html("""
+     <style>
+     div.stButton {
+         display: flex;
+         justify-content: center;
+         margin: 10px 0;
+     }
+
+     div.stButton > button:first-child {
+         width: 80%;
+         padding: 40px 0;
+         background-color: #f0f0f0 !important;
+         border: none !important;
+         border-radius: 10px !important;
+         color: #333 !important;
+         font-family: 'Work Sans', sans-serif !important;
+         font-weight: 600 !important;
+         transition: all 0.3s ease;
+     }
+
+     /* Target the button text directly */
+     div.stButton > button:first-child p,
+     div.stButton > button:first-child span,
+     div.stButton > button:first-child div,
+     div.stButton > button:first-child {
+         font-size: 24px !important;
+         line-height: 1.2 !important;
+     }
+
+     div.stButton > button:first-child:hover {
+         background-color: #e0e0e0 !important;
+         transform: translateY(-2px);
+         box-shadow: 0 4px 8px rgba(0,0,0,0.1);
+     }
+     </style>
+     """)
+
+     with cols[0]:
+         if st.button("Generate Questions on a Topic", key="main_topic"):
+             st.query_params.page = "topic"
+             st.rerun()
+     with cols[1]:
+         if st.button("Generate Questions from a Chapter", key="main_chapter"):
+             st.query_params.page = "chapter"
+             st.rerun()
+
+     if st.session_state.get('questions_to_download'):
+         with st.sidebar:
+             st.markdown("---")  # Divider
+             st.markdown("**Download Questions**")  # Sidebar section title
+
+             docx_file = create_docx_from_data(st.session_state.get('questions_to_download', {}))
+
+             st.download_button(
+                 label="📄 Download as Word (.docx)",
+                 data=docx_file,
+                 file_name="questions.docx",
+                 mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                 on_click="ignore"
+             )
+     else:
+         with st.sidebar:
+             st.markdown("---")
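Note: the chunking parameters passed to update_collection above (max_words=200, min_words=100, overlap_sentences=3) point at sentence-based chunking with a few sentences of overlap between consecutive chunks. The actual implementation lives in app/backend/chunks_processing.py and is not reproduced in this view; the following is only a minimal sketch of what those parameters suggest, with chunk_text as a hypothetical name:

# Minimal sketch of sentence-based chunking with overlap.
# Assumption: the real chunker in app/backend/chunks_processing.py may differ.
import re

def chunk_text(text, max_words=200, min_words=100, overlap_sentences=3):
    # Naive sentence split on end punctuation; real code may use a tokenizer.
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    chunks, current = [], []
    for sentence in sentences:
        current.append(sentence)
        if sum(len(s.split()) for s in current) >= max_words:
            chunks.append(" ".join(current))
            # Carry the last few sentences into the next chunk for continuity.
            current = current[-overlap_sentences:]
    # Keep the tail only if it is long enough to be a meaningful chunk.
    if current and sum(len(s.split()) for s in current) >= min_words:
        chunks.append(" ".join(current))
    return chunks

Overlapping a handful of sentences keeps retrieval context continuous across chunk boundaries, which matters when a topic query lands near a boundary.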
app/main_IO.py ADDED
@@ -0,0 +1,112 @@
+ import streamlit as st
+ import fitz  # PyMuPDF
+ from PIL import Image
+ import io
+
+
+ DEFAULT_SESSION_STATE = {
+     # PDF Upload
+     'doc': None,
+     'uploaded_pdf_name': None,
+     'pdf_changed': False,
+     'uploaded_pdf_bytes': None,
+     'page_range_set': False,
+     'page_range_updated': False,
+     'full_text': None,
+     'pages_data_infos': None,
+
+     # TOC
+     'page_choice': None,
+     'toc_page_range': None,
+     'toc': None,
+
+     # Chapters
+     'chapters_starting_page': None,
+     'chapters_dict': None,
+     'chapters_extracted': None,
+     'chapters_chunked': None,
+     'selected_chapter_idx': None,
+     'selected_chapter_title': None,
+     'num_questions': None,
+     'chapter_selected_chunks': None,
+     'chapter_prompt': None,
+
+     # Topics
+     'query': None,
+     'questions_ready_topic': False,
+
+     # Questions
+     'questions_dict_chapter': None,
+     'questions_dict_topic': None,
+     'raw_output': None,  # remove this (only for debug)
+     'questions_ready_chapter': False,
+     'questions_to_download': {}
+ }
+
+
+ def initialise_session_state():
+     """
+     Initializes the session state variables if not already set.
+     """
+     for key, default_val in DEFAULT_SESSION_STATE.items():
+         if key not in st.session_state:
+             st.session_state[key] = default_val
+
+
+ def reset_session_state_on_upload():
+     """
+     Resets session state variables to their default values,
+     keeping any previously generated downloads.
+     """
+     for key, default_val in DEFAULT_SESSION_STATE.items():
+         if key != 'questions_to_download':
+             st.session_state[key] = default_val
+
+
+ def upload_pdf():
+     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"], label_visibility="collapsed")
+
+     if uploaded_file is not None:
+         prev_file = st.session_state.get('uploaded_pdf_name')
+         if uploaded_file.name != prev_file:
+             # New file detected
+             reset_session_state_on_upload()
+             st.session_state['pdf_changed'] = True
+         else:
+             st.session_state['pdf_changed'] = False
+
+         pdf_bytes = uploaded_file.read()
+
+         if pdf_bytes:
+             st.session_state['uploaded_pdf_bytes'] = pdf_bytes
+             st.session_state['uploaded_pdf_name'] = uploaded_file.name
+             st.success(f"File '{uploaded_file.name}' uploaded successfully!")
+         else:
+             st.error("Uploaded file is empty!")
+
+     elif st.session_state.get('uploaded_pdf_bytes') is not None:
+         st.success("File uploaded successfully!")
+     else:
+         st.info("Please upload a PDF file to proceed.")
+
+
+ def show_pdf_preview():
+     # The key is pre-initialised to None, so check truthiness, not membership.
+     pdf_bytes = st.session_state.get('uploaded_pdf_bytes')
+     if pdf_bytes:
+         doc = None
+         try:
+             doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+             if doc.page_count < 1:
+                 st.sidebar.error("PDF has no pages!")
+                 return
+             page = doc.load_page(0)
+             pix = page.get_pixmap()
+             img = Image.open(io.BytesIO(pix.tobytes("png")))
+             st.sidebar.image(img, caption="First page preview", use_container_width=True)
+         except Exception as e:
+             st.sidebar.error(f"Failed to open PDF: {e}")
+         finally:
+             if doc is not None:
+                 doc.close()
+     else:
+         st.sidebar.write("Upload a PDF to see a preview here.")
+
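A side note on show_pdf_preview(): PyMuPDF's get_pixmap() renders at 72 dpi by default, so the sidebar thumbnail can look soft on high-resolution screens. A zoom matrix is the usual PyMuPDF way to render larger; this sketch is not part of the commit and first_page_thumbnail is a hypothetical helper:

# Sketch: render the first page at 2x resolution for a crisper preview.
# (Not in this commit; show_pdf_preview() above uses the default pixmap.)
import io
import fitz  # PyMuPDF
from PIL import Image

def first_page_thumbnail(pdf_bytes, zoom=2.0):
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page = doc.load_page(0)
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))  # scale x and y
        return Image.open(io.BytesIO(pix.tobytes("png")))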
app/pages/1_chapter_questions.py ADDED
@@ -0,0 +1,90 @@
+ import streamlit as st
+ from app.utils import *
+ from app.main_IO import *
+ from app.pages.utils_chapter.display_pages import *
+ from app.pages.utils_chapter.display_questions import *
+ from app.pages.utils_chapter.chapter_extraction import *
+ from app.pages.utils_chapter.chapter_selection import *
+ from app.download_questions import create_docx_from_data
+
+ # Set up logger
+ if st.session_state.get("use_logger"):
+     level = st.selectbox("Logging level", ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
+     logging.getLogger().setLevel(getattr(logging, level))
+ else:
+     logging.getLogger().setLevel(logging.CRITICAL + 1)  # Silence all logging
+
+ # Initialise
+ apply_style()
+
+ # add_sidebar_header()
+ st.sidebar.html("""
+ <div style='position: fixed; top: 10px; left: 20px; z-index: 999; padding: 10px;'>
+     <h3>Menu</h3>
+ </div>
+ """)
+
+ show_pdf_preview()
+ st.title("Generate Questions from a Chapter")
+ st.divider()
+ st.write("""
+ Here you can generate questions based on a specific chapter.
+ To do this, please first select the page range that includes the Table of Contents (TOC) — sometimes called the index or contents page — which lists the chapters and their page numbers.
+
+ This step is important because it helps the app automatically identify and locate chapters, so you can easily choose the exact chapter to generate questions from.
+ """)
+
+ # Display the page range selector
+ breaks(1)
+ display_scrollable_pages()
+
+ # UI and interaction
+ set_clicked, start_page, end_page = page_range_selector_ui()
+
+ if set_clicked:
+     updated = handle_page_range_submission(start_page, end_page)
+     st.session_state["page_range_updated"] = updated
+
+ if st.session_state.get("page_range_updated", False):
+     extract_content_if_needed()
+     st.session_state["page_range_updated"] = False
+
+ # Gate the rest of the page on a valid page range
+ if st.session_state.get("page_range_set", False):
+
+     # Show the form that generates questions for the selected chapter
+     result = chapter_question_form()
+     if result:
+         st.session_state.questions_dict_chapter = result
+         debug_log(f"questions: {st.session_state.get('questions_dict_chapter', 'None')}")
+
+     if st.session_state.get("questions_ready_chapter"):
+         breaks(2)
+         st.subheader("Generated Questions")
+         st.divider()
+         # Visualize generated questions and store them
+         show_questions(st.session_state.get('questions_dict_chapter'))
+         breaks(1)
+         show_download_controls(st.session_state.get('selected_chapter_title'), st.session_state.get('questions_dict_chapter'))
+         debug_show_selected_questions()
+
+     with st.sidebar:
+         st.markdown("---")  # Divider
+         st.markdown("**Download Questions**")  # Sidebar section title
+
+         docx_file = create_docx_from_data(st.session_state.get('questions_to_download', {}))
+
+         st.download_button(
+             label="📄 Download as Word (.docx)",
+             data=docx_file,
+             file_name="questions.docx",
+             mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+             on_click="ignore"
+         )
+
+ else:
+     st.info("Please set a valid page range to continue.")
+
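The chapter flow above leans on helpers such as page_range_selector_ui and extract_content_if_needed from app.pages.utils_chapter, plus the TOC parser in app/backend/toc_parser.py, none of which appear in this view. As a rough illustration of the parsing problem they solve, a TOC line such as "1. Introduction ....... 3" pairs a chapter title with a page number. A hedged sketch follows; parse_toc_lines is a hypothetical name and the real parser is likely more robust:

# Sketch of TOC-line parsing.
# Assumption: the actual logic lives in app/backend/toc_parser.py.
import re

TOC_LINE = re.compile(r'^(?P<title>.+?)[\s.]+(?P<page>\d+)\s*$')

def parse_toc_lines(lines):
    chapters = []
    for line in lines:
        match = TOC_LINE.match(line.strip())
        if match:
            chapters.append((match.group('title').strip('. '), int(match.group('page'))))
    return chapters

# Example: parse_toc_lines(["1. Introduction ....... 3", "2. Methods 17"])
# -> [('1. Introduction', 3), ('2. Methods', 17)]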
app/pages/2_topic_questions.py ADDED
@@ -0,0 +1,106 @@
+ import streamlit as st
+ import chromadb
+ from app.utils import *
+ from app.main_IO import *
+ from app.download_questions import create_docx_from_data
+ from app.pages.utils_chapter.display_questions import *
+ from app.pages.utils_chapter.chapter_selection import select_num_questions
+ from app.backend.chunks_processing import query_collection
+ from app.backend.messages_templates import book_prompt
+ from app.backend.runpod_client import run_prompt, clean_and_parse_json
+
+
+ # Initialise
+ apply_style()
+
+ # add_sidebar_header()
+ st.sidebar.html("""
+ <div style='position: fixed; top: 10px; left: 20px; z-index: 999; padding: 10px;'>
+     <h3>Menu</h3>
+ </div>
+ """)
+
+ show_pdf_preview()
+ st.title("Generate Questions on a Topic")
+ st.divider()
+ st.write("""Here you can generate questions based on a specific topic.
+ Enter a topic or keyword, and the app will generate questions based on the content of the uploaded PDF.""")
+
+ breaks(1)
+
+ # Set up logger
+ if st.session_state.get("use_logger"):
+     level = st.selectbox("Logging level", ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
+     logging.getLogger().setLevel(getattr(logging, level))
+ else:
+     logging.getLogger().setLevel(logging.CRITICAL + 1)  # Silence all logging
+
+ if st.session_state.get("full_text") is not None:
+     client = chromadb.Client()  # Must use the same client init/config as main.py
+     whole_text_collection = client.get_collection("whole_text_chunks")
+     debug_log(f"Collection name: {whole_text_collection.name}")
+     debug_log(f"Number of documents: {whole_text_collection.count()}")
+     results = whole_text_collection.get(limit=1)
+
+     documents = results['documents']  # List of text chunks
+     metadatas = results['metadatas']  # List of metadata dicts, e.g. chunk indexes
+
+     for i, (doc, meta) in enumerate(zip(documents, metadatas)):
+         debug_log(f"Chunk {i}:")
+         debug_log(doc)
+         debug_log(f"Metadata: {meta}")
+
+     with st.form("query_form"):
+         st.subheader("Enter a Topic or Keyword")
+         query = st.text_input("Enter your query:")
+         col1, _ = st.columns([2, 6])
+         with col1:
+             num_questions = select_num_questions()
+         breaks(1)
+         submitted = st.form_submit_button("Submit")
+
+     if submitted and query:
+         with st.spinner("Generating questions..."):
+             # Retrieve relevant chunks and generate questions from them
+             query_context = query_collection(whole_text_collection, query=query, nresults=3, context_multiplier=2)
+             prompt = book_prompt(query_context, num_questions=num_questions, user_query=query)
+             questions_json = run_prompt(prompt)
+             st.session_state.questions_dict_topic = clean_and_parse_json(questions_json)
+             st.session_state['query'] = query
+             st.session_state['questions_ready_topic'] = True
+
+     if st.session_state.get("questions_ready_topic"):
+         breaks(2)
+         st.subheader("Generated Questions")
+         st.divider()
+         debug_log(f"Generated questions: {st.session_state.get('questions_dict_topic', 'None')}")
+
+         # Visualize generated questions and store them
+         show_questions(st.session_state['questions_dict_topic'])
+         breaks(1)
+         show_download_controls(st.session_state.get('query'), st.session_state.get('questions_dict_topic'))
+         debug_show_selected_questions()
+
+     with st.sidebar:
+         st.divider()
+         st.markdown("**Download Questions**")  # Sidebar section title
+
+         docx_file = create_docx_from_data(st.session_state.get('questions_to_download', {}))
+
+         st.download_button(
+             label="📄 Download as Word (.docx)",
+             data=docx_file,
+             file_name="questions.docx",
+             mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+             on_click="ignore"
+         )
+
+
+ # query_context = query_collection(whole_text_collection, query=query, nresults=3, context_multiplier=2)
+ # out3 = book_prompt(query_context, num_questions=3, user_query=query)
+ # questions = run_prompt(out3)
+
+ # Possible input widgets:
+ # https://docs.streamlit.io/develop/api-reference/chat/st.chat_input
+ # https://docs.streamlit.io/develop/api-reference/widgets/st.text_input
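query_collection (imported from app/backend/chunks_processing.py, not shown in this view) takes nresults and a context_multiplier, which suggests it retrieves the top matches and then widens each hit with surrounding chunks. Ignoring the widening step, the core retrieval against a Chroma collection looks roughly like this sketch; query_topic_chunks is a hypothetical name:

# Hedged sketch of topic retrieval (assumption: the real query_collection
# in app/backend/chunks_processing.py may expand context differently).
def query_topic_chunks(collection, query, nresults=3):
    results = collection.query(query_texts=[query], n_results=nresults)
    # Chroma returns one list of documents per query text; we sent one query.
    return "\n\n".join(results["documents"][0])

# Usage: context = query_topic_chunks(whole_text_collection, "photosynthesis")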
app/pages/3_inspect_pdf.py ADDED
@@ -0,0 +1,8 @@
+ import streamlit as st
+ from app.utils import *
+
+ # Initialise
+ apply_style()
+
+ st.title("Work in Progress: Inspect PDF")
+
app/pages/__init__.py ADDED
File without changes
app/pages/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (161 Bytes). View file
 
app/pages/__pycache__/page1_utils.cpython-312.pyc ADDED
Binary file (4.7 kB). View file
 
app/pages/utils_chapter/__init__.py ADDED
File without changes
app/pages/utils_chapter/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (175 Bytes). View file
 
app/pages/utils_chapter/__pycache__/chapter_extraction.cpython-312.pyc ADDED
Binary file (5.1 kB). View file
 
app/pages/utils_chapter/__pycache__/chapter_selection.cpython-312.pyc ADDED
Binary file (4.56 kB). View file
 
app/pages/utils_chapter/__pycache__/display_pages.cpython-312.pyc ADDED
Binary file (5.76 kB). View file
 
app/pages/utils_chapter/__pycache__/display_questions.cpython-312.pyc ADDED
Binary file (5.75 kB). View file
 
app/pages/utils_chapter/__pycache__/download_questions.cpython-312.pyc ADDED
Binary file (1.46 kB). View file
 
app/pages/utils_chapter/__pycache__/page1_utils.cpython-312.pyc ADDED
Binary file (6.09 kB). View file