Spaces:
Sleeping
Sleeping
Davide Panza
commited on
Upload 56 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- app/__init__.py +0 -0
- app/__pycache__/__init__.cpython-312.pyc +0 -0
- app/__pycache__/download_questions.cpython-312.pyc +0 -0
- app/__pycache__/main_IO.cpython-312.pyc +0 -0
- app/__pycache__/utils.cpython-312.pyc +0 -0
- app/backend/__init__.py +0 -0
- app/backend/__pycache__/__init__.cpython-312.pyc +0 -0
- app/backend/__pycache__/chromadb_utils.cpython-312.pyc +0 -0
- app/backend/__pycache__/chunks_processing.cpython-312.pyc +0 -0
- app/backend/__pycache__/get_requests.cpython-312.pyc +0 -0
- app/backend/__pycache__/messages_templates.cpython-312.pyc +0 -0
- app/backend/__pycache__/raw_text_processing.cpython-312.pyc +0 -0
- app/backend/__pycache__/runpod_client.cpython-312.pyc +0 -0
- app/backend/__pycache__/text_processing.cpython-312.pyc +0 -0
- app/backend/__pycache__/toc_parser.cpython-312.pyc +0 -0
- app/backend/chromadb_utils.py +61 -0
- app/backend/chunks_processing.py +32 -0
- app/backend/get_requests.py +37 -0
- app/backend/messages_templates.py +265 -0
- app/backend/raw_text_processing.py +167 -0
- app/backend/runpod_client.py +68 -0
- app/backend/text_processing.py +95 -0
- app/chromadb_model/1_Pooling/config.json +10 -0
- app/chromadb_model/README.md +173 -0
- app/chromadb_model/config.json +26 -0
- app/chromadb_model/config_sentence_transformers.json +10 -0
- app/chromadb_model/model.safetensors +3 -0
- app/chromadb_model/modules.json +20 -0
- app/chromadb_model/sentence_bert_config.json +4 -0
- app/chromadb_model/special_tokens_map.json +37 -0
- app/chromadb_model/tokenizer.json +0 -0
- app/chromadb_model/tokenizer_config.json +65 -0
- app/chromadb_model/vocab.txt +0 -0
- app/download_questions.py +23 -0
- app/main.py +170 -0
- app/main_IO.py +112 -0
- app/pages/1_chapter_questions.py +90 -0
- app/pages/2_topic_questions.py +106 -0
- app/pages/3_inspect_pdf.py +8 -0
- app/pages/__init__.py +0 -0
- app/pages/__pycache__/__init__.cpython-312.pyc +0 -0
- app/pages/__pycache__/page1_utils.cpython-312.pyc +0 -0
- app/pages/utils_chapter/__init__.py +0 -0
- app/pages/utils_chapter/__pycache__/__init__.cpython-312.pyc +0 -0
- app/pages/utils_chapter/__pycache__/chapter_extraction.cpython-312.pyc +0 -0
- app/pages/utils_chapter/__pycache__/chapter_selection.cpython-312.pyc +0 -0
- app/pages/utils_chapter/__pycache__/display_pages.cpython-312.pyc +0 -0
- app/pages/utils_chapter/__pycache__/display_questions.cpython-312.pyc +0 -0
- app/pages/utils_chapter/__pycache__/download_questions.cpython-312.pyc +0 -0
- app/pages/utils_chapter/__pycache__/page1_utils.cpython-312.pyc +0 -0
app/__init__.py
ADDED
|
File without changes
|
app/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (155 Bytes). View file
|
|
|
app/__pycache__/download_questions.cpython-312.pyc
ADDED
|
Binary file (1.44 kB). View file
|
|
|
app/__pycache__/main_IO.cpython-312.pyc
ADDED
|
Binary file (4.73 kB). View file
|
|
|
app/__pycache__/utils.cpython-312.pyc
ADDED
|
Binary file (2.64 kB). View file
|
|
|
app/backend/__init__.py
ADDED
|
File without changes
|
app/backend/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (163 Bytes). View file
|
|
|
app/backend/__pycache__/chromadb_utils.cpython-312.pyc
ADDED
|
Binary file (2.32 kB). View file
|
|
|
app/backend/__pycache__/chunks_processing.cpython-312.pyc
ADDED
|
Binary file (2.05 kB). View file
|
|
|
app/backend/__pycache__/get_requests.cpython-312.pyc
ADDED
|
Binary file (1.95 kB). View file
|
|
|
app/backend/__pycache__/messages_templates.cpython-312.pyc
ADDED
|
Binary file (8.35 kB). View file
|
|
|
app/backend/__pycache__/raw_text_processing.cpython-312.pyc
ADDED
|
Binary file (8.14 kB). View file
|
|
|
app/backend/__pycache__/runpod_client.cpython-312.pyc
ADDED
|
Binary file (3.37 kB). View file
|
|
|
app/backend/__pycache__/text_processing.cpython-312.pyc
ADDED
|
Binary file (3.7 kB). View file
|
|
|
app/backend/__pycache__/toc_parser.cpython-312.pyc
ADDED
|
Binary file (821 Bytes). View file
|
|
|
app/backend/chromadb_utils.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import chromadb
|
| 2 |
+
from chromadb.utils import embedding_functions
|
| 3 |
+
from .text_processing import text_chunking
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def initialize_chromadb(EMBEDDING_MODEL, local_model_path=None):
    """
    Initialize a ChromaDB client and a sentence-transformer embedding function.

    Args:
        EMBEDDING_MODEL (str): Model name to load when no local path is given.
        local_model_path (str, optional): Filesystem path of a locally stored
            model; takes precedence over EMBEDDING_MODEL when provided.

    Returns:
        tuple: (chromadb client, SentenceTransformerEmbeddingFunction)
    """
    client = chromadb.Client()

    # Prefer the local model when a (truthy) path was supplied.
    model_source = local_model_path if local_model_path else EMBEDDING_MODEL
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=model_source
    )

    return client, embedding_func
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def initialize_collection(client, embedding_func, collection_name):
    """
    Fetch or create a ChromaDB collection configured for cosine similarity.

    Args:
        client: ChromaDB client instance.
        embedding_func: Embedding function attached to the collection.
        collection_name (str): Name of the collection.

    Returns:
        The ChromaDB collection object.
    """
    # get_or_create is idempotent: an existing collection with this name is reused.
    return client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_func,
        metadata={"hnsw:space": "cosine"},
    )
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def update_collection(
    collection,
    text,
    max_words=200,
    min_words=100,
    overlap_sentences=3,
):
    """
    Chunk a text and add every chunk to a ChromaDB collection.

    Args:
        collection: ChromaDB collection object.
        text (str): The text to be chunked and added.
        max_words (int): Maximum number of words per chunk.
        min_words (int): Minimum number of words per chunk.
        overlap_sentences (int): Number of sentences to overlap between chunks.

    Returns:
        None
    """
    chunks = text_chunking(
        text,
        max_words=max_words,
        min_words=min_words,
        overlap_sentences=overlap_sentences,
    )

    indexed = list(enumerate(chunks))
    # Zero-padded ids keep lexicographic and numeric ordering consistent.
    collection.add(
        documents=chunks,
        ids=[f"chunk_{pos:04d}" for pos, _ in indexed],
        metadatas=[{"chunk_index": pos} for pos, _ in indexed],
    )
|
| 61 |
+
|
app/backend/chunks_processing.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import streamlit as st
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def query_collection(collection, query='', nresults=3, context_multiplier=2, sim_th=None):
    """Get relevant text from a collection for a given query.

    Retrieves nresults * context_multiplier documents. When sim_th is given,
    documents whose cosine similarity (1 - distance) is below the threshold
    are dropped and the survivors are concatenated into a single string;
    otherwise the raw list of documents is returned unchanged.
    """
    result = collection.query(query_texts=query, n_results=nresults * context_multiplier)
    documents = result.get('documents')[0]

    if sim_th is None:
        return documents

    # Chroma reports cosine *distance*; similarity is its complement.
    distances = result.get("distances")[0]
    kept = [doc for doc, dist in zip(documents, distances) if 1 - dist >= sim_th]
    return ''.join(kept)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def get_chapter_context(chapters, chapter_number, n_questions):
    """
    Randomly select up to n_questions chunks from a chapter and store them in
    Streamlit session state under 'chapter_selected_chunks'.

    Args:
        chapters: Indexable container of chapter dicts, each expected to carry
            a 'chunks' list.
        chapter_number: Index/key of the chapter to sample from.
        n_questions (int): Desired number of chunks (capped at what exists).

    Raises:
        ValueError: If the chapter is None, lacks a 'chunks' key, or has an
            empty chunk list.
    """
    chapter = chapters[chapter_number]
    # Validate before any attribute access: the original debug call
    # chapter.keys() ran before the None check and would have raised
    # AttributeError instead of the intended ValueError.
    if chapter is None:
        raise ValueError(f"Chapter {chapter_number} not found in the chapters list.")
    # Error message now names the key actually being checked ('chunks', not 'text').
    if 'chunks' not in chapter:
        raise ValueError(f"Chapter {chapter_number} does not contain 'chunks' key.")

    n_chunks = len(chapter['chunks'])
    if n_chunks == 0:
        raise ValueError(f"Chapter {chapter_number} has no chunks to process.")

    # Sample without replacement; never request more chunks than exist.
    chunks_indices = random.sample(range(n_chunks), min(n_questions, n_chunks))
    st.session_state['chapter_selected_chunks'] = [chapter['chunks'][i] for i in chunks_indices]
|
| 32 |
+
|
app/backend/get_requests.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.backend.runpod_client import format_messages_as_prompt, run_prompt, clean_and_parse_json
|
| 2 |
+
from app.backend.messages_templates import toc_prompt, chapter_prompt, chapter_prompt_edgecase
|
| 3 |
+
import streamlit as st
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def extract_chapters_from_toc(toc_text: str):
    """
    Run the TOC-extraction prompt through the LLM and store the parsed chapter
    list in Streamlit session state under 'chapters_dict'.

    Args:
        toc_text (str): Raw table-of-contents text extracted from the PDF.
    """
    # The prompt is already formatted for Gemma 3; no extra message wrapping needed.
    prompt = toc_prompt(toc_text)
    raw_output = run_prompt(prompt)
    st.session_state['chapters_dict'] = clean_and_parse_json(raw_output)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def generate_questions_from_chapter(chunks, num_questions, max_questions=5):
    """
    Generate questions from chapter chunks via the LLM.

    Args:
        chunks (list): Text chunks used as question contexts.
        num_questions (int): Number of questions requested.
        max_questions (int): Hard cap applied inside the prompt builder.

    Returns:
        Parsed questions on success, or None when the model output is not
        valid JSON.
    """
    prompt = chapter_prompt(contexts=chunks, num_questions=num_questions, max_questions=max_questions)
    raw_output = run_prompt(prompt)
    try:
        generated_questions = clean_and_parse_json(raw_output)
    except Exception as exc:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are
        # not swallowed; include the failure detail instead of a silent drop.
        print(f"Error parsing JSON: {exc}")
        return None
    st.success("Questions generated successfully!")
    return generated_questions
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def generate_questions_from_chapter_edgecase(chunks, num_questions, max_questions=5):
    """
    Generate questions when fewer context groups are available than questions
    requested (the edge-case prompt allows reusing/combining groups).

    Args:
        chunks (list): Grouped text chunks used as question contexts.
        num_questions (int): Number of questions requested.
        max_questions (int): Hard cap applied inside the prompt builder.

    Returns:
        Parsed questions on success, or None when the model output is not
        valid JSON.
    """
    prompt = chapter_prompt_edgecase(grouped_chunks=chunks, num_questions=num_questions, max_questions=max_questions)
    raw_output = run_prompt(prompt)
    try:
        generated_questions = clean_and_parse_json(raw_output)
    except Exception as exc:
        # Narrowed from a bare `except:`; surface the parse failure detail.
        print(f"Error parsing JSON: {exc}")
        return None
    st.success("Questions generated successfully!")
    return generated_questions
|
app/backend/messages_templates.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# def get_toc_extraction_messages(toc_text: str):
|
| 2 |
+
# return [
|
| 3 |
+
# {
|
| 4 |
+
# "role": "system",
|
| 5 |
+
# "content": "You are a precise document parser that extracts structured information from table of contents. You NEVER hallucinate You NEVER hallucinate, invent, or make up information. You ONLY extract what is explicitly present in the provided text. If you cannot find clear chapter information, you return an empty array. You do not guess chapter titles or page numbers."
|
| 6 |
+
# },
|
| 7 |
+
# {
|
| 8 |
+
# "role": "user",
|
| 9 |
+
# "content": "I need to extract main chapter information from this table of contents. Only extract numbered chapters, ignore subsections. Do not make up any information."
|
| 10 |
+
# },
|
| 11 |
+
# {
|
| 12 |
+
# "role": "assistant",
|
| 13 |
+
# "content": "I understand. I will extract ONLY the main chapters that are explicitly shown in your table of contents. I will not invent, guess, or hallucinate any chapter titles or page numbers. I will only use the exact information present in the document."
|
| 14 |
+
# },
|
| 15 |
+
# {
|
| 16 |
+
# "role": "user",
|
| 17 |
+
# "content": f"""Here is the table of contents:
|
| 18 |
+
|
| 19 |
+
# {toc_text}
|
| 20 |
+
|
| 21 |
+
# WARNING: DO NOT HALLUCINATE OR INVENT INFORMATION
|
| 22 |
+
# - Do NOT make up chapter titles like "Probability", "Statistical Inference", "Linear Regression"
|
| 23 |
+
# - Do NOT guess page numbers
|
| 24 |
+
# - Do NOT create generic textbook chapters
|
| 25 |
+
# - ONLY extract what you can clearly see in the provided text
|
| 26 |
+
|
| 27 |
+
# CRITICAL RULES:
|
| 28 |
+
# 1. Extract ONLY main chapters that start with a number (1, 2, 3, etc.)
|
| 29 |
+
# 2. Do NOT extract subsections (like 1.1, 1.2, 2.1, etc.)
|
| 30 |
+
# 3. Use the EXACT chapter titles shown in the document
|
| 31 |
+
# 4. Use the EXACT page numbers shown in the document
|
| 32 |
+
# 5. Handle both roman numerals (i, ii, iii, v, x) and arabic numerals (1, 25, 100)
|
| 33 |
+
# 6. Calculate end pages as: next chapter's start page minus 1
|
| 34 |
+
# 7. Return ONLY valid JSON - no explanations, no markdown formatting
|
| 35 |
+
# 8. If you cannot clearly identify chapters, return empty array []
|
| 36 |
+
|
| 37 |
+
# Look for patterns like:
|
| 38 |
+
# - "1 Probability Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1"
|
| 39 |
+
# - "2 Distribution Theory and Statistical Models . . . . . . . . . . . . . . . . 155"
|
| 40 |
+
# - "3 Basic Statistical Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 205"
|
| 41 |
+
|
| 42 |
+
# DO NOT extract lines like:
|
| 43 |
+
# - "1.1 Some Important Music Concepts . . . . . . . . . . . 3"
|
| 44 |
+
# - "Preface . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v"
|
| 45 |
+
|
| 46 |
+
# Use ONLY the exact titles from the document. Do not shorten or modify them.
|
| 47 |
+
|
| 48 |
+
# Return JSON array: [{{"chapter_number": "X", "chapter_title": "...", "start_page": X, "end_page": X}}]
|
| 49 |
+
|
| 50 |
+
# REMEMBER: Extract only what is explicitly visible in the text. Do not hallucinate. Be complete and extract all chapters that are clearly numbered. y chapters, return an empty array []."""
|
| 51 |
+
# },
|
| 52 |
+
# {
|
| 53 |
+
# "role": "assistant",
|
| 54 |
+
# "content": "I will carefully examine the table of contents and extract only the main chapters that are explicitly shown, using their exact titles and page numbers. I will not invent or hallucinate any information."
|
| 55 |
+
# }
|
| 56 |
+
# ]
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def toc_prompt(toc_text: str):
    """
    Build a Gemma-3-formatted prompt asking the model to extract main chapter
    entries (number, title, start/end page) from a table of contents.

    Args:
        toc_text (str): Raw TOC text pulled from the PDF.

    Returns:
        str: A single prompt string using <start_of_turn>/<end_of_turn>
        markers, ending on an open model turn so the model's next output is
        the JSON array itself.
    """
    # Convert to Gemma 3 format - single string with proper turn markers
    prompt = f"""<start_of_turn>user
You are a precise document parser that extracts structured information from table of contents. You NEVER hallucinate, invent, or make up information. You ONLY extract what is explicitly present in the provided text. If you cannot find clear chapter information, you return an empty array.

I need to extract main chapter information from this table of contents. Only extract numbered chapters, ignore subsections. Do not make up any information.

Here is the table of contents:

{toc_text}

WARNING: DO NOT HALLUCINATE OR INVENT INFORMATION
- Do NOT make up chapter titles like "Probability", "Statistical Inference", "Linear Regression"
- Do NOT guess page numbers
- Do NOT create generic textbook chapters
- ONLY extract what you can clearly see in the provided text

CRITICAL RULES:
1. Extract ONLY main chapters that start with a number (1, 2, 3, etc.)
2. Do NOT extract subsections (like 1.1, 1.2, 2.1, etc.)
3. Use the EXACT chapter titles shown in the document
4. Use the EXACT page numbers shown in the document
5. Handle both roman numerals (i, ii, iii, v, x) and arabic numerals (1, 25, 100)
6. Calculate end pages as: next chapter's start page minus 1
7. Return ONLY valid JSON - no explanations, no markdown formatting
8. If you cannot clearly identify chapters, return empty array []

Look for patterns like:
- "1 Probability Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1"
- "2 Distribution Theory and Statistical Models . . . . . . . . . . . . . . . . 155"
- "3 Basic Statistical Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 205"

DO NOT extract lines like:
- "1.1 Some Important Music Concepts . . . . . . . . . . . 3"
- "Preface . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v"

Use ONLY the exact titles from the document. Do not shorten or modify them.

Return JSON array: [{{"chapter_number": "X", "chapter_title": "...", "start_page": X, "end_page": X}}]

Extract only what is explicitly visible in the text. Do not hallucinate. Be complete and extract all chapters that are clearly numbered. If no clear main chapters, return an empty array [].<end_of_turn>
<start_of_turn>model
I will carefully examine the table of contents and extract only the main chapters that are explicitly shown, using their exact titles and page numbers. I will not invent or hallucinate any information.

Looking at the provided table of contents, I will now extract the main chapters:<end_of_turn>
<start_of_turn>user
Perfect. Now provide the JSON array with the extracted chapters.<end_of_turn>
<start_of_turn>model
"""

    return prompt
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def chapter_prompt(contexts, num_questions, max_questions=5):
    """
    Build a Gemma 3 12B-IT prompt that requests diverse questions grounded in
    the given text contexts.

    Args:
        contexts (list): List of text contexts to base questions on.
        num_questions (int): Number of questions to generate.
        max_questions (int): Maximum number of questions allowed.

    Returns:
        str: Formatted prompt string for Gemma 3 model.
    """
    # Cap the request and pre-render the context block once.
    qcount = min(num_questions, max_questions)
    ctx_block = format_contexts(contexts)

    # Gemma uses special tokens for instruction tuning.
    return f"""<start_of_turn>user
You are a question generation expert. Generate exactly {qcount} diverse questions based on the provided text contexts.

IMPORTANT REQUIREMENTS:
1. Output MUST be valid JSON format
2. Generate EXACTLY {qcount} questions
3. Each question must have a complete answer from the contexts
4. Vary question types (what, why, how, when, explain, compare)
5. Do not generate yes/no questions
6. Answers should be 1-3 sentences long

CONTEXTS:
{ctx_block}

OUTPUT FORMAT - Return ONLY valid JSON array:
[
{{"question": "Your question here?", "answer": "Complete answer from the context"}},
{{"question": "Another question?", "answer": "Another answer"}}
]

Generate the questions now:<end_of_turn>
<start_of_turn>model
"""
|
| 153 |
+
|
| 154 |
+
def chapter_prompt_edgecase(grouped_chunks, num_questions, max_questions=5):
    """
    Build a Gemma 3 12B-IT prompt for the edge case where fewer contexts were
    retrieved than questions requested; the model is allowed to reuse or
    combine context groups to hit the requested count.

    Args:
        grouped_chunks (list): Grouped text contexts.
        num_questions (int): Number of questions to generate.
        max_questions (int): Maximum number of questions allowed.

    Returns:
        str: Formatted prompt string for Gemma 3 model.
    """
    qcount = min(num_questions, max_questions)
    groups_block = format_contexts(grouped_chunks)

    return f"""<start_of_turn>user
Generate {qcount} questions from the following contexts. You may:
- Generate one or more questions from each context
- Use multiple contexts for a single question
- Skip contexts if they don't contain meaningful information

REQUIREMENTS:
1. Output valid JSON array format
2. Generate EXACTLY {qcount} questions
3. Each answer must be found in the provided contexts
4. Create diverse question types
5. Reference which context group(s) you used

CONTEXT GROUPS:
{groups_block}

OUTPUT FORMAT - Return ONLY this JSON structure:
[
{{"question": "Question text?", "answer": "Answer text", "context_used": [1, 2]}},
{{"question": "Question text?", "answer": "Answer text", "context_used": [1]}}
]

Generate the questions:<end_of_turn>
<start_of_turn>model
"""
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def book_prompt(contexts, num_questions, user_query=None, max_questions=5):
    """
    Build a Gemma 3 12B-IT prompt with optional topic awareness.

    Args:
        contexts (list): Text contexts retrieved based on the user query.
        num_questions (int): Number of questions to generate.
        user_query (str): The original user query/topic, if any.
        max_questions (int): Maximum number of questions allowed.

    Returns:
        str: Formatted prompt string for Gemma 3 model.
    """
    qcount = min(num_questions, max_questions)

    # Topic section is only present when a query was supplied.
    if user_query:
        topic_section = f"""
TOPIC FOCUS: {user_query}
The following contexts were retrieved based on this topic. Generate questions that:
- Relate to the main topic: "{user_query}"
- Explore different aspects of this topic found in the contexts
- Connect the topic to broader concepts when relevant

"""
    else:
        topic_section = ""

    topic_label = user_query if user_query else "the provided content"
    ctx_block = format_contexts(contexts)

    return f"""<start_of_turn>user
You are a question generation expert. Generate exactly {qcount} diverse questions based on the provided text contexts.
{topic_section}
IMPORTANT REQUIREMENTS:
1. Output MUST be valid JSON format
2. Generate EXACTLY {qcount} questions
3. Each question must have a complete answer from the contexts
4. Vary question types (what, why, how, when, explain, compare)
5. Do not generate yes/no questions
6. Answers should be 1-3 sentences long
7. Questions should explore different aspects of the topic

CONTEXTS (Retrieved based on topic: "{topic_label}"):
{ctx_block}

OUTPUT FORMAT - Return ONLY valid JSON array:
[
{{"question": "Your question here?", "answer": "Complete answer from the context"}},
{{"question": "Another question?", "answer": "Another answer"}}
]

Generate the questions now:<end_of_turn>
<start_of_turn>model
"""
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def format_contexts(contexts):
    """
    Render contexts as numbered, blank-line-separated blocks for prompting.
    """
    blocks = [f"Context {n}:\n{chunk.strip()}\n\n" for n, chunk in enumerate(contexts, 1)]
    return "".join(blocks).strip()
|
| 265 |
+
|
app/backend/raw_text_processing.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz # PyMuPDF
|
| 2 |
+
import warnings
|
| 3 |
+
import streamlit as st
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def extract_page_data_fitz(doc):
    """
    Extract per-page numbering and text from a PDF using PyMuPDF.

    Printed page numbers are searched for in the top and bottom 15% of each
    page (header/footer bands); the first purely-numeric token found there is
    taken as the page number.

    Returns:
        list[dict]: One dict per page with keys 'index' (0-based position in
        the document), 'number' (detected printed number or None) and
        'content' (full page text).
    """
    pages_data = []

    for idx, page in enumerate(doc):
        page_h = page.rect.height
        page_w = page.rect.width

        # Header and footer bands where printed page numbers usually live.
        header_clip = fitz.Rect(0, 0, page_w, page_h * 0.15)
        footer_clip = fitz.Rect(0, page_h * 0.85, page_w, page_h)

        candidate_tokens = (
            page.get_text("text", clip=header_clip).split()
            + page.get_text("text", clip=footer_clip).split()
        )
        page_number = next((int(tok) for tok in candidate_tokens if tok.isdigit()), None)

        pages_data.append({
            "index": idx,
            "number": page_number,
            "content": page.get_text("text"),
        })

    return pages_data
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def correct_page_numbers(pages_data, sequence_length=10):
    """
    Repair detected page numbers using the first run of consecutive values.

    Searches the detected numbers for `sequence_length` entries that increase
    by exactly one, anchors on the first such run, then renumbers every page
    forward and backward relative to that anchor. Computed numbers below 1
    (front matter, covers) are reset to None.

    Returns:
        int | None: The stored 'index' of the page numbered 1, or None when
        no anchor run exists or any error occurs.
    """
    try:
        detected = [
            (pos, entry["number"])
            for pos, entry in enumerate(pages_data)
            if isinstance(entry["number"], int)
        ]

        # Find the first strictly-consecutive run of detected numbers.
        anchor = None
        for start in range(len(detected) - sequence_length + 1):
            first_num = detected[start][1]
            if all(detected[start + k][1] == first_num + k for k in range(sequence_length)):
                anchor = detected[start]
                break
        if anchor is None:
            return None
        anchor_pos, anchor_num = anchor

        # Renumber every page relative to the anchor.
        for pos, page in enumerate(pages_data):
            page["number"] = anchor_num + (pos - anchor_pos)

        # Printed numbering cannot go below 1.
        for page in pages_data:
            if page["number"] < 1:
                page["number"] = None

        return next((page["index"] for page in pages_data if page["number"] == 1), None)

    except Exception:
        # Best-effort: any unexpected data shape means "could not correct".
        return None
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def extract_text(doc, start_chapter=None):
    """
    Extract the book text from `doc` starting at page index `start_chapter`.

    Falls back to the entire document (with a warning) when no starting page
    was detected.
    """
    if start_chapter is None:
        warnings.warn(
            "No chapter start has been detected: extracting text from the entire PDF.",
            UserWarning
        )
        return "\n".join(page.get_text("text") for page in doc)

    # Collect pages from the detected start through the end of the document.
    selected = [doc[page_idx].get_text("text") for page_idx in range(start_chapter, len(doc))]
    return "\n".join(selected)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def process_pdf():
    """
    Process the uploaded PDF: extract per-page data, locate the page where the
    first chapter starts, and pull the main text.

    Reads the PDF bytes from st.session_state['uploaded_pdf_bytes'] and writes
    'full_text', 'pages_data_infos' and 'chapters_starting_page' back to
    session state. Shows an error and returns early when no PDF was uploaded.
    """
    pdf_bytes = st.session_state.get("uploaded_pdf_bytes")
    if not pdf_bytes:
        st.error("No PDF uploaded.")
        return

    with st.spinner("Processing uploaded file..."):
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            page_infos = extract_page_data_fitz(doc)
            first_chapter_page = correct_page_numbers(page_infos)
            body_text = extract_text(doc, first_chapter_page)

        st.session_state['full_text'] = body_text
        st.session_state['pages_data_infos'] = page_infos
        st.session_state['chapters_starting_page'] = first_chapter_page
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def extract_toc(page_range):
    """
    Extract the table-of-contents text from a user-indicated page range and
    store it in st.session_state['toc'].

    Args:
        page_range: Iterable of 0-based page numbers to read from the PDF.
    """
    pdf_bytes = st.session_state.get("uploaded_pdf_bytes")
    if pdf_bytes is None:
        st.error("No PDF uploaded.")
        return ""

    collected = []
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_count = len(doc)
        for page_num in page_range:
            if 0 <= page_num < page_count:
                collected.append(doc[page_num].get_text("text"))
            else:
                # Out-of-range pages are skipped rather than aborting the extraction.
                print(f"Warning: Page number {page_num} is out of bounds.")

    st.session_state["toc"] = "\n".join(collected)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def extract_chapters(chapters_dict, pages_data_corrected):
    """
    Extract chapter contents from the parsed TOC and per-page data.
    Args:
        chapters_dict (list): List of chapter dictionaries from the TOC; each
            must contain 'chapter_number', 'chapter_title', 'start_page' and
            'end_page' (pages are 1-based and inclusive).
        pages_data_corrected (list): List of page data dictionaries with content.
    Side effects:
        Stores the resulting list of chapter dictionaries (chapter details plus
        the concatenated page 'content') in
        st.session_state['chapters_extracted']. Returns None.
    """
    # Initialize an empty list to hold chapter dictionaries
    chapters = []

    # Iterate through each chapter in the JSON
    for chapter in chapters_dict:
        start_page = chapter['start_page']
        end_page = chapter['end_page']
        chapter_text = []

        # Extract content for the chapter from the pages data.
        # Pages are 1-based and the range end is inclusive, hence start_page-1.
        for chapter_range in range(start_page-1, end_page):
            chapter_text.append(pages_data_corrected[chapter_range]['content'])

        chapter_text = ' '.join(chapter_text)

        # Create a dictionary for the chapter
        chapter_dict = {
            'chapter_number': chapter['chapter_number'],
            'chapter_title': chapter['chapter_title'],
            'start_page': start_page,
            'end_page': end_page,
            'content': chapter_text
        }

        chapters.append(chapter_dict)

    st.session_state['chapters_extracted'] = chapters
|
app/backend/runpod_client.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
import requests
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
import json
|
| 6 |
+
import codecs
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
# Load .env from project root
|
| 10 |
+
load_dotenv(dotenv_path=Path(__file__).resolve().parents[2] / ".env")
|
| 11 |
+
|
| 12 |
+
API_KEY = os.getenv("RUNPOD_API_KEY")
|
| 13 |
+
ENDPOINT = os.getenv("RUNPOD_ENDPOINT")
|
| 14 |
+
|
| 15 |
+
HEADERS = {
|
| 16 |
+
"Authorization": f"Bearer {API_KEY}",
|
| 17 |
+
"Content-Type": "application/json"
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def format_messages_as_prompt(messages):
    """Flatten a chat-style message list into one prompt string.

    Each message dict (keys 'role' and 'content') becomes a
    "Role: content" segment; a trailing "Assistant:" cue invites the
    model to continue. Segments are separated by blank lines.
    """
    segments = [
        f"{msg['role'].capitalize()}: {msg['content']}" for msg in messages
    ]
    segments.append("Assistant:")
    return "\n\n".join(segments)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def run_prompt(prompt: str) -> str:
    """Submit a prompt to the RunPod endpoint and get back a response string.

    Starts an asynchronous job via ``POST {ENDPOINT}/run`` and then polls
    ``GET {ENDPOINT}/status/<job_id>`` every 3 seconds until the job reaches
    a terminal state.

    Args:
        prompt: The fully formatted prompt string to send to the model.

    Returns:
        The model's response text (``output.response`` of the completed job).

    Raises:
        RuntimeError: if the endpoint does not return a job id or the job fails.
        requests.HTTPError: if the job-start request returns an error status.
    """
    payload = {"input":
               {"prompt": prompt}
               }

    # Start job. A timeout prevents hanging forever on a dead connection,
    # and raise_for_status surfaces auth/endpoint errors immediately.
    response = requests.post(f"{ENDPOINT}/run", headers=HEADERS, json=payload, timeout=30)
    response.raise_for_status()
    job_id = response.json().get("id")
    if not job_id:
        # Without an id we would poll a nonsense URL forever below.
        raise RuntimeError(f"RunPod did not return a job id: {response.text}")
    print(f"[RunPod] Job started: {job_id}")

    # Poll for status until the job reaches a terminal state.
    while True:
        status_res = requests.get(
            f"{ENDPOINT}/status/{job_id}", headers=HEADERS, timeout=30
        ).json()
        status = status_res.get("status")
        print(f"[RunPod] Status: {status}")
        if status in ("COMPLETED", "FAILED"):
            break
        time.sleep(3)

    if status == "COMPLETED":
        return status_res["output"]["response"]
    else:
        # Include the full status payload so failures are diagnosable.
        raise RuntimeError(f"RunPod job failed: {status_res}")
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def clean_and_parse_json(raw_text: str):
    """Clean and parse model output into JSON.

    Strips Markdown code fences (```json ... ```) and stray surrounding
    quotes, then parses the remainder. Falls back to unicode-unescaping the
    text once if the first parse fails (handles doubly-escaped model output).

    Args:
        raw_text: Raw text emitted by the model.

    Returns:
        The parsed JSON value (dict, list, str, number, bool, or None).

    Raises:
        ValueError: if the text cannot be parsed as JSON even after cleanup.
    """
    cleaned = raw_text.strip()
    # NOTE: the previous str.strip("```json") treated the argument as a
    # character *set*, so outputs beginning/ending with any of ` j s o n
    # were corrupted (e.g. "null" -> "ull"). Use prefix/suffix removal.
    cleaned = cleaned.removeprefix("```json").removeprefix("```")
    cleaned = cleaned.removesuffix("```")
    cleaned = cleaned.strip().strip("'")
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        try:
            # Handle escaped quotes (model sometimes double-escapes output)
            unescaped = codecs.decode(cleaned, 'unicode_escape')
            return json.loads(unescaped)
        except Exception as e:
            raise ValueError("Could not parse JSON output") from e
|
| 68 |
+
|
app/backend/text_processing.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from nltk.tokenize import sent_tokenize
|
| 2 |
+
import nltk
|
| 3 |
+
import streamlit as st
|
| 4 |
+
|
| 5 |
+
try:
|
| 6 |
+
nltk.data.find("tokenizers/punkt")
|
| 7 |
+
except LookupError:
|
| 8 |
+
nltk.download("punkt")
|
| 9 |
+
|
| 10 |
+
def text_chunking(text, max_words=750, min_words=400, overlap_sentences=5):
    """
    Creates text chunks up to max_words using sentences as indivisible units.
    Each chunk can overlap with the next one by overlap_sentences.
    Chunks smaller than min_words are merged with the next chunk.

    Note: chunks exceeding 1000 words after merging are dropped entirely
    (treated as data blocks or malformed content).
    """
    # Tokenize once and precompute per-sentence word counts so the packing
    # loop below never re-splits sentences.
    sentences = sent_tokenize(text)
    word_counts = [len(sentence.split()) for sentence in sentences]

    chunks = []
    i = 0  # index of the next sentence to place

    while i < len(sentences):
        chunk_sentences = []
        word_count = 0
        chunk_start = i  # remembered so the overlap rewind cannot loop forever

        # Build chunk: greedily add whole sentences until the next one
        # would push the chunk past max_words (always take at least one).
        while i < len(sentences):
            if word_count + word_counts[i] > max_words and chunk_sentences:
                break
            chunk_sentences.append(sentences[i])
            word_count += word_counts[i]
            i += 1

        if chunk_sentences:
            chunks.append(" ".join(chunk_sentences))

        # Add overlap for next chunk: rewind i by up to overlap_sentences,
        # but never behind chunk_start + 1 so each iteration still advances.
        if i < len(sentences):
            chunk_size = len(chunk_sentences)
            overlap = min(overlap_sentences, chunk_size - 1)
            i = max(i - overlap, chunk_start + 1)

    # Merge small chunks with next chunk (single forward pass; a merged pair
    # consumes both entries and is not re-examined).
    merged_chunks = []
    i = 0
    while i < len(chunks):
        current_chunk = chunks[i]
        current_words = len(current_chunk.split())

        # If current chunk is too small and there's a next chunk, merge them
        if current_words < min_words and i + 1 < len(chunks):
            next_chunk = chunks[i + 1]
            next_words = len(next_chunk.split())

            # Only merge if combined size won't be too large
            if current_words + next_words <= max_words:
                merged_chunk = current_chunk + " " + next_chunk
                merged_chunks.append(merged_chunk)
                i += 2  # Skip next chunk since we merged it
            else:
                # Keep small chunk as-is if merging would be too large
                merged_chunks.append(current_chunk)
                i += 1
        else:
            merged_chunks.append(current_chunk)
            i += 1

    # Remove chunks that are too long (likely data blocks or malformed content)
    final_chunks = []
    for chunk in merged_chunks:
        if len(chunk.split()) <= 1000:
            final_chunks.append(chunk)

    return final_chunks
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def chapters_chunking(chapters, max_words=500, min_words=300, overlap_sentences=5):
    """
    Chunk each chapter's content into smaller parts and store the result
    in st.session_state['chapters_chunked'].

    :param chapters: List of chapter dictionaries.
    :param max_words: Maximum number of words per chunk.
    :param min_words: Minimum number of words per chunk.
    :param overlap_sentences: Number of sentences to overlap between chunks.
    :return: None; the chunked chapters are written to session state.
    """
    chunked = []
    for chapter in chapters:
        entry = {
            'chapter_number': chapter['chapter_number'],
            'chapter_title': chapter['chapter_title'],
            'chunks': text_chunking(chapter['content'], max_words, min_words, overlap_sentences),
        }
        chunked.append(entry)
    st.session_state['chapters_chunked'] = chunked
|
app/chromadb_model/1_Pooling/config.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"word_embedding_dimension": 384,
|
| 3 |
+
"pooling_mode_cls_token": false,
|
| 4 |
+
"pooling_mode_mean_tokens": true,
|
| 5 |
+
"pooling_mode_max_tokens": false,
|
| 6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
| 7 |
+
"pooling_mode_weightedmean_tokens": false,
|
| 8 |
+
"pooling_mode_lasttoken": false,
|
| 9 |
+
"include_prompt": true
|
| 10 |
+
}
|
app/chromadb_model/README.md
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language: en
|
| 3 |
+
license: apache-2.0
|
| 4 |
+
library_name: sentence-transformers
|
| 5 |
+
tags:
|
| 6 |
+
- sentence-transformers
|
| 7 |
+
- feature-extraction
|
| 8 |
+
- sentence-similarity
|
| 9 |
+
- transformers
|
| 10 |
+
datasets:
|
| 11 |
+
- s2orc
|
| 12 |
+
- flax-sentence-embeddings/stackexchange_xml
|
| 13 |
+
- ms_marco
|
| 14 |
+
- gooaq
|
| 15 |
+
- yahoo_answers_topics
|
| 16 |
+
- code_search_net
|
| 17 |
+
- search_qa
|
| 18 |
+
- eli5
|
| 19 |
+
- snli
|
| 20 |
+
- multi_nli
|
| 21 |
+
- wikihow
|
| 22 |
+
- natural_questions
|
| 23 |
+
- trivia_qa
|
| 24 |
+
- embedding-data/sentence-compression
|
| 25 |
+
- embedding-data/flickr30k-captions
|
| 26 |
+
- embedding-data/altlex
|
| 27 |
+
- embedding-data/simple-wiki
|
| 28 |
+
- embedding-data/QQP
|
| 29 |
+
- embedding-data/SPECTER
|
| 30 |
+
- embedding-data/PAQ_pairs
|
| 31 |
+
- embedding-data/WikiAnswers
|
| 32 |
+
pipeline_tag: sentence-similarity
|
| 33 |
+
---
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# all-MiniLM-L6-v2
|
| 37 |
+
This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
|
| 38 |
+
|
| 39 |
+
## Usage (Sentence-Transformers)
|
| 40 |
+
Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
|
| 41 |
+
|
| 42 |
+
```
|
| 43 |
+
pip install -U sentence-transformers
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
Then you can use the model like this:
|
| 47 |
+
```python
|
| 48 |
+
from sentence_transformers import SentenceTransformer
|
| 49 |
+
sentences = ["This is an example sentence", "Each sentence is converted"]
|
| 50 |
+
|
| 51 |
+
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
| 52 |
+
embeddings = model.encode(sentences)
|
| 53 |
+
print(embeddings)
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
## Usage (HuggingFace Transformers)
|
| 57 |
+
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
|
| 58 |
+
|
| 59 |
+
```python
|
| 60 |
+
from transformers import AutoTokenizer, AutoModel
|
| 61 |
+
import torch
|
| 62 |
+
import torch.nn.functional as F
|
| 63 |
+
|
| 64 |
+
#Mean Pooling - Take attention mask into account for correct averaging
|
| 65 |
+
def mean_pooling(model_output, attention_mask):
|
| 66 |
+
token_embeddings = model_output[0] #First element of model_output contains all token embeddings
|
| 67 |
+
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
|
| 68 |
+
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# Sentences we want sentence embeddings for
|
| 72 |
+
sentences = ['This is an example sentence', 'Each sentence is converted']
|
| 73 |
+
|
| 74 |
+
# Load model from HuggingFace Hub
|
| 75 |
+
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
|
| 76 |
+
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
|
| 77 |
+
|
| 78 |
+
# Tokenize sentences
|
| 79 |
+
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
| 80 |
+
|
| 81 |
+
# Compute token embeddings
|
| 82 |
+
with torch.no_grad():
|
| 83 |
+
model_output = model(**encoded_input)
|
| 84 |
+
|
| 85 |
+
# Perform pooling
|
| 86 |
+
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
|
| 87 |
+
|
| 88 |
+
# Normalize embeddings
|
| 89 |
+
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
|
| 90 |
+
|
| 91 |
+
print("Sentence embeddings:")
|
| 92 |
+
print(sentence_embeddings)
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
------
|
| 96 |
+
|
| 97 |
+
## Background
|
| 98 |
+
|
| 99 |
+
The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised
|
| 100 |
+
contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned it on a
|
| 101 |
+
1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences, was actually paired with it in our dataset.
|
| 102 |
+
|
| 103 |
+
We developed this model during the
|
| 104 |
+
[Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
|
| 105 |
+
organized by Hugging Face. We developed this model as part of the project:
|
| 106 |
+
[Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as guidance from Google's Flax, JAX, and Cloud team members about efficient deep learning frameworks.
|
| 107 |
+
|
| 108 |
+
## Intended uses
|
| 109 |
+
|
| 110 |
+
Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
|
| 111 |
+
the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
|
| 112 |
+
|
| 113 |
+
By default, input text longer than 256 word pieces is truncated.
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
## Training procedure
|
| 117 |
+
|
| 118 |
+
### Pre-training
|
| 119 |
+
|
| 120 |
+
We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
|
| 121 |
+
|
| 122 |
+
### Fine-tuning
|
| 123 |
+
|
| 124 |
+
We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity from each possible sentence pairs from the batch.
|
| 125 |
+
We then apply the cross entropy loss by comparing with true pairs.
|
| 126 |
+
|
| 127 |
+
#### Hyper parameters
|
| 128 |
+
|
| 129 |
+
We trained our model on a TPU v3-8. We train the model during 100k steps using a batch size of 1024 (128 per TPU core).
|
| 130 |
+
We use a learning rate warm up of 500. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
|
| 131 |
+
a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
|
| 132 |
+
|
| 133 |
+
#### Training data
|
| 134 |
+
|
| 135 |
+
We use the concatenation from multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion sentences.
|
| 136 |
+
We sampled each dataset given a weighted probability which configuration is detailed in the `data_config.json` file.
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
| Dataset | Paper | Number of training tuples |
|
| 140 |
+
|--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
|
| 141 |
+
| [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
|
| 142 |
+
| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
|
| 143 |
+
| [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
|
| 144 |
+
| [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
|
| 145 |
+
| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
|
| 146 |
+
| [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
|
| 147 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 |
|
| 148 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 |
|
| 149 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 |
|
| 150 |
+
| [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
|
| 151 |
+
| [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
|
| 152 |
+
| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
|
| 153 |
+
| [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
|
| 154 |
+
| [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395|
|
| 155 |
+
| [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
|
| 156 |
+
| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
|
| 157 |
+
| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
|
| 158 |
+
| [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
|
| 159 |
+
| [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
|
| 160 |
+
| [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
|
| 161 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
|
| 162 |
+
| AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
|
| 163 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
|
| 164 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
|
| 165 |
+
| [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
|
| 166 |
+
| [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
|
| 167 |
+
| [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
|
| 168 |
+
| [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
|
| 169 |
+
| [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
|
| 170 |
+
| [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
|
| 171 |
+
| [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
|
| 172 |
+
| [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
|
| 173 |
+
| **Total** | | **1,170,060,424** |
|
app/chromadb_model/config.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"BertModel"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
+
"gradient_checkpointing": false,
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 384,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 1536,
|
| 14 |
+
"layer_norm_eps": 1e-12,
|
| 15 |
+
"max_position_embeddings": 512,
|
| 16 |
+
"model_type": "bert",
|
| 17 |
+
"num_attention_heads": 12,
|
| 18 |
+
"num_hidden_layers": 6,
|
| 19 |
+
"pad_token_id": 0,
|
| 20 |
+
"position_embedding_type": "absolute",
|
| 21 |
+
"torch_dtype": "float32",
|
| 22 |
+
"transformers_version": "4.48.3",
|
| 23 |
+
"type_vocab_size": 2,
|
| 24 |
+
"use_cache": true,
|
| 25 |
+
"vocab_size": 30522
|
| 26 |
+
}
|
app/chromadb_model/config_sentence_transformers.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"__version__": {
|
| 3 |
+
"sentence_transformers": "4.1.0",
|
| 4 |
+
"transformers": "4.48.3",
|
| 5 |
+
"pytorch": "2.6.0"
|
| 6 |
+
},
|
| 7 |
+
"prompts": {},
|
| 8 |
+
"default_prompt_name": null,
|
| 9 |
+
"similarity_fn_name": "cosine"
|
| 10 |
+
}
|
app/chromadb_model/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1377e9af0ca0b016a9f2aa584d6fc71ab3ea6804fae21ef9fb1416e2944057ac
|
| 3 |
+
size 90864192
|
app/chromadb_model/modules.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"idx": 0,
|
| 4 |
+
"name": "0",
|
| 5 |
+
"path": "",
|
| 6 |
+
"type": "sentence_transformers.models.Transformer"
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"idx": 1,
|
| 10 |
+
"name": "1",
|
| 11 |
+
"path": "1_Pooling",
|
| 12 |
+
"type": "sentence_transformers.models.Pooling"
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"idx": 2,
|
| 16 |
+
"name": "2",
|
| 17 |
+
"path": "2_Normalize",
|
| 18 |
+
"type": "sentence_transformers.models.Normalize"
|
| 19 |
+
}
|
| 20 |
+
]
|
app/chromadb_model/sentence_bert_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"max_seq_length": 256,
|
| 3 |
+
"do_lower_case": false
|
| 4 |
+
}
|
app/chromadb_model/special_tokens_map.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": {
|
| 3 |
+
"content": "[CLS]",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"mask_token": {
|
| 10 |
+
"content": "[MASK]",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": {
|
| 17 |
+
"content": "[PAD]",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"sep_token": {
|
| 24 |
+
"content": "[SEP]",
|
| 25 |
+
"lstrip": false,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"unk_token": {
|
| 31 |
+
"content": "[UNK]",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
}
|
| 37 |
+
}
|
app/chromadb_model/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app/chromadb_model/tokenizer_config.json
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"100": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"101": {
|
| 20 |
+
"content": "[CLS]",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"102": {
|
| 28 |
+
"content": "[SEP]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"103": {
|
| 36 |
+
"content": "[MASK]",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": false,
|
| 45 |
+
"cls_token": "[CLS]",
|
| 46 |
+
"do_basic_tokenize": true,
|
| 47 |
+
"do_lower_case": true,
|
| 48 |
+
"extra_special_tokens": {},
|
| 49 |
+
"mask_token": "[MASK]",
|
| 50 |
+
"max_length": 128,
|
| 51 |
+
"model_max_length": 256,
|
| 52 |
+
"never_split": null,
|
| 53 |
+
"pad_to_multiple_of": null,
|
| 54 |
+
"pad_token": "[PAD]",
|
| 55 |
+
"pad_token_type_id": 0,
|
| 56 |
+
"padding_side": "right",
|
| 57 |
+
"sep_token": "[SEP]",
|
| 58 |
+
"stride": 0,
|
| 59 |
+
"strip_accents": null,
|
| 60 |
+
"tokenize_chinese_chars": true,
|
| 61 |
+
"tokenizer_class": "BertTokenizer",
|
| 62 |
+
"truncation_side": "right",
|
| 63 |
+
"truncation_strategy": "longest_first",
|
| 64 |
+
"unk_token": "[UNK]"
|
| 65 |
+
}
|
app/chromadb_model/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app/download_questions.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from docx import Document
|
| 3 |
+
from io import BytesIO
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def create_docx_from_data(data):
    """Build an in-memory Word document from question data.

    ``data`` maps chapter titles to lists of ``{'question', 'answer'}``
    dicts. The document gets a "Questions" heading, a timestamp, and one
    numbered Q/A section per chapter.

    Returns a BytesIO rewound to position 0, ready to stream as a download.
    """
    document = Document()
    document.add_heading("Questions", 0)
    document.add_paragraph(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    for chapter_title, qa_pairs in data.items():
        document.add_heading(chapter_title, level=1)
        document.add_paragraph("")  # Spacing
        question_number = 0
        for qa in qa_pairs:
            question_number += 1
            document.add_paragraph(f"Q{question_number}: {qa['question']}", style='List Number')
            document.add_paragraph(f"A: {qa['answer']}", style='Normal')
            document.add_paragraph("")  # Spacing

    out = BytesIO()
    document.save(out)
    out.seek(0)
    return out
|
app/main.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Streamlit entry point for the Text2Test app.

Handles query-parameter page navigation, PDF upload and text extraction,
ChromaDB indexing of the extracted text, and the landing-page UI.
"""
import streamlit as st
from utils import *
from main_IO import *
from download_questions import create_docx_from_data
from backend.raw_text_processing import *
from backend.chromadb_utils import *
import os
import sys
import logging


# Add the root folder (one level above 'app') to sys.path
# so the package is importable when this file is run as a script.
root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if root_path not in sys.path:
    sys.path.insert(0, root_path)

# Configuration
configure_page()
initialise_session_state()
apply_style()

# add_sidebar_header()
st.sidebar.html("""
    <div style='position: fixed; top: 10px; left: 20px; z-index: 999; padding: 10px;'>
        <h3>Menu</h3>
    </div>
""")

# Initialize chromadb variables
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
model_path = "./chromadb_model"

# Set-up Logger
# NOTE(review): use_logger is hard-coded to False, so the selectbox branch
# below is currently dead code; flip to True to enable runtime log control.
st.session_state.use_logger = False
if st.session_state.use_logger:
    level = st.selectbox("Logging level", ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
    logging.getLogger().setLevel(getattr(logging, level))

# Set default page if not specified
if "page" not in st.query_params:
    st.query_params.page = "main"

# Navigation handling: st.switch_page() stops this script and runs the target page.
if st.query_params.page == "topic":
    st.switch_page("pages/2_topic_questions.py")
elif st.query_params.page == "chapter":
    st.switch_page("pages/1_chapter_questions.py")
elif st.query_params.page == "inspect":
    st.switch_page("pages/3_inspect_pdf.py")
else:
    # Welcome message
    st.title("Welcome to Text2Test!")
    st.divider()
    st.markdown("""
Welcome! This app helps you transform your PDFs or texts into interactive study materials by generating meaningful questions.
You can either:

- Generate questions based on specific topics or keywords
- Generate questions from a selected chapter

Start by uploading your PDF file, then choose your preferred way to generate questions using the options below.
Let’s make studying smarter and more engaging!
""")
    st.divider()

    # Upload PDF file
    st.subheader("Upload your PDF file")
    upload_pdf()
    st.divider()

    # Check if PDF has changed or needs processing
    # (re-extract when a new PDF was uploaded, or when text was never extracted).
    if st.session_state.get("pdf_changed") or (
        st.session_state.get("full_text") is None and
        st.session_state.get("uploaded_pdf_bytes") is not None
    ):
        process_pdf()  # Extract text from PDF

        with st.spinner("Extracting information from the text..."):
            # Chunk the full text and index it in ChromaDB so the topic page
            # can run similarity queries against "whole_text_chunks".
            client, embedding_func = initialize_chromadb(EMBEDDING_MODEL)
            whole_text_collection = initialize_collection(client, embedding_func, "whole_text_chunks")
            update_collection(
                whole_text_collection,
                st.session_state.get("full_text"),
                max_words=200,
                min_words=100,
                overlap_sentences=3
            )
        st.session_state["pdf_changed"] = False  # Reset flag after processing

    try:
        uploaded_pdf_name = st.session_state.get('uploaded_pdf_name', None)
        if uploaded_pdf_name:
            st.info(f"Uploaded PDF: {uploaded_pdf_name}")
            debug_log(f"book title: {uploaded_pdf_name}")
        else:
            pass

        show_pdf_preview()

    except Exception as e:
        # Preview/info failures are non-fatal; log and keep rendering the page.
        debug_log(f"Error displaying PDF info or preview: {e}")

    # Main content buttons
    st.subheader("Generate Questions")
    st.write("Please choose an option to generate questions:")
    breaks(1)
    cols = st.columns(2)
    st.html("""
    <style>
    div.stButton {
        display: flex;
        justify-content: center;
        margin: 10px 0;
    }

    div.stButton > button:first-child {
        width: 80%;
        padding: 40px 0;
        background-color: #f0f0f0 !important;
        border: none !important;
        border-radius: 10px !important;
        color: #333 !important;
        font-family: 'Work Sans', sans-serif !important;
        font-weight: 600 !important;
        transition: all 0.3s ease;
    }

    /* Target the button text directly */
    div.stButton > button:first-child p,
    div.stButton > button:first-child span,
    div.stButton > button:first-child div,
    div.stButton > button:first-child {
        font-size: 24px !important;
        line-height: 1.2 !important;
    }

    div.stButton > button:first-child:hover {
        background-color: #e0e0e0 !important;
        transform: translateY(-2px);
        box-shadow: 0 4px 8px rgba(0,0,0,0.1);
    }
    </style>
    """)

    # Buttons navigate by updating the query param and forcing a rerun,
    # which routes through the switch_page logic above.
    with cols[0]:
        if st.button("Generate Questions on a Topic", key="main_topic"):
            st.query_params.page = "topic"
            st.rerun()
    with cols[1]:
        if st.button("Generate Questions from a Chapter", key="main_chapter"):
            st.query_params.page = "chapter"
            st.rerun()

    # Sidebar download section: only offered once questions were generated.
    if st.session_state.get('questions_to_download'):
        with st.sidebar:
            st.markdown("---")  # Divider
            st.markdown("**Download Questions**")  # Spacing

            docx_file = create_docx_from_data(st.session_state.get('questions_to_download', {}))

            st.download_button(
                label="📄 Download as Word (.docx)",
                data=docx_file,
                file_name="questions.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                on_click="ignore"
            )
    else:
        with st.sidebar:
            st.markdown("---")
|
app/main_IO.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import fitz # PyMuPDF
|
| 3 |
+
from PIL import Image
|
| 4 |
+
import io
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Default values for every st.session_state key used across the app's pages.
# initialise_session_state() seeds missing keys from this mapping, and
# reset_session_state_on_upload() restores all of them except
# 'questions_to_download', which accumulates across uploads.
DEFAULT_SESSION_STATE = {
    # PDF Upload
    'doc': None,
    'uploaded_pdf_name': None,   # filename of the last uploaded PDF
    'pdf_changed': False,        # True right after a *new* PDF is uploaded
    'uploaded_pdf_bytes': None,  # raw bytes of the uploaded PDF
    'page_range_set': False,
    'page_range_updated': False,
    'full_text': None,           # extracted text of the PDF (set by process_pdf — confirm)
    'pages_data_infos': None,

    # TOC
    'page_choice': None,
    'toc_page_range': None,
    'toc': None,

    # Chapters
    'chapters_starting_page': None,
    'chapters_dict': None,
    'chapters_extracted': None,
    'chapters_chunked': None,
    'selected_chapter_idx': None,
    'selected_chapter_title': None,
    'num_questions': None,
    'chapter_selected_chunks': None,
    'chapter_prompt': None,

    # Topics
    'query': None,               # last topic/keyword submitted by the user
    'questions_ready_topic': False,

    # Questions
    'questions_dict_chapter': None,
    'questions_dict_topic': None,
    'raw_output': None,  # remove this (only for debug)
    'questions_ready_chapter': False,
    'questions_to_download': {}  # questions selected for .docx export
}
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def initialise_session_state():
    """Seed st.session_state with defaults for any keys not yet present.

    Keys that already exist keep their current values, so calling this on
    every rerun does not clobber user state.
    """
    missing = (k for k in DEFAULT_SESSION_STATE if k not in st.session_state)
    for key in missing:
        st.session_state[key] = DEFAULT_SESSION_STATE[key]
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def reset_session_state_on_upload():
    """Restore every session key to its default, except the download dict.

    'questions_to_download' is preserved so questions saved from a previous
    PDF survive when a new file is uploaded.
    """
    preserved_key = 'questions_to_download'
    for key, default_val in DEFAULT_SESSION_STATE.items():
        if key == preserved_key:
            continue
        st.session_state[key] = default_val
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def upload_pdf():
    """Render the PDF uploader widget and sync its result into session state.

    A file with a new name resets the session (marking `pdf_changed`), while
    re-seeing the same file leaves existing state alone. Shows success, error
    or hint messages depending on the upload situation.
    """
    uploaded_file = st.file_uploader("", type=["pdf"])

    if uploaded_file is None:
        # Nothing in the widget this rerun: either a PDF was uploaded earlier
        # (bytes are cached in session state) or we are still waiting for one.
        if st.session_state.get('uploaded_pdf_bytes') is not None:
            st.success("File uploaded successfully!")
        else:
            st.info("Please upload a PDF file to proceed.")
        return

    # Detect a genuinely new file by comparing names with the previous upload.
    is_new_file = uploaded_file.name != st.session_state.get('uploaded_pdf_name')
    if is_new_file:
        reset_session_state_on_upload()
    st.session_state['pdf_changed'] = is_new_file

    pdf_bytes = uploaded_file.read()
    if not pdf_bytes:
        st.error("Uploaded file is empty!")
        return

    st.session_state['uploaded_pdf_bytes'] = pdf_bytes
    st.session_state['uploaded_pdf_name'] = uploaded_file.name
    st.success(f"File '{uploaded_file.name}' uploaded successfully!")
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def show_pdf_preview():
    """Render a preview of the uploaded PDF's first page in the sidebar.

    Shows a hint message when no PDF has been uploaded yet, and a sidebar
    error if the stored bytes cannot be parsed as a PDF.
    """
    pdf_bytes = st.session_state.get('uploaded_pdf_bytes')
    # BUG FIX: the previous check was `'uploaded_pdf_bytes' in st.session_state`,
    # which is always true once initialise_session_state() has pre-seeded the
    # key with None — so fitz.open(stream=None) raised and users saw a
    # "Failed to open PDF" error before uploading anything. Check the value.
    if pdf_bytes is None:
        st.sidebar.write("Upload a PDF to see a preview here.")
        return

    doc = None
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        if doc.page_count < 1:
            st.sidebar.error("PDF has no pages!")
            return
        page = doc.load_page(0)
        pix = page.get_pixmap()
        img = Image.open(io.BytesIO(pix.tobytes("png")))
        st.sidebar.image(img, caption="First page preview", use_container_width=True)
    except Exception as e:
        st.sidebar.error(f"Failed to open PDF: {e}")
    finally:
        # Always release the fitz document handle, even after errors.
        if doc is not None:
            doc.close()
|
| 112 |
+
|
app/pages/1_chapter_questions.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Streamlit page: generate questions from a single chapter of the uploaded PDF.

The user first selects the page range containing the Table of Contents so the
app can locate chapters, then picks a chapter and generates questions from it.
"""
import streamlit as st
from app.utils import *
from app.main_IO import *
from app.pages.utils_chapter.display_pages import *
from app.pages.utils_chapter.display_questions import *
from app.pages.utils_chapter.chapter_extraction import *
from app.pages.utils_chapter.chapter_selection import *
from app.download_questions import create_docx_from_data

# Set up logger
# NOTE(review): `logging` is not imported directly in this file — presumably
# it is re-exported by one of the star imports above; confirm.
if st.session_state.use_logger:
    level = st.selectbox("Logging level", ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
    logging.getLogger().setLevel(getattr(logging, level))
else:
    # A level above CRITICAL effectively silences all logging output.
    logging.getLogger().setLevel(logging.CRITICAL + 1)

# Initialise
apply_style()

# add_sidebar_header()
st.sidebar.html("""
    <div style='position: fixed; top: 10px; left: 20px; z-index: 999; padding: 10px;'>
        <h3>Menu</h3>
    </div>
""")

show_pdf_preview()
st.title("Generate Questions from a Chapter")
st.divider()
st.write("""
Here you can generate questions based on a specific chapter.
To do this, please first select the page range that includes the Table of Contents (TOC) — sometimes called the index or contents page — which lists the chapters and their page numbers.

This step is important because it helps the app automatically identify and locate chapters, so you can easily choose the exact chapter to generate questions from.
""")

# Display the page range selector
breaks(1)
display_scrollable_pages()

# UI and Interaction
set_clicked, start_page, end_page = page_range_selector_ui()

if set_clicked:
    updated = handle_page_range_submission(start_page, end_page)
    st.session_state["page_range_updated"] = updated

# Re-extract chapter content only when the range actually changed,
# then clear the flag so subsequent reruns skip the extraction.
if st.session_state.get("page_range_updated", False):
    extract_content_if_needed()
    st.session_state["page_range_updated"] = False

# Gate rest of app
if st.session_state.get("page_range_set", False):

    # Call the form in your main app code to generate questions
    result = chapter_question_form()
    if result:
        st.session_state.questions_dict_chapter = result
        debug_log(f"questions: {st.session_state.get('questions_dict_chapter', 'None')}")

    if st.session_state.get("questions_ready_chapter"):
        breaks(2)
        st.subheader("Generated Questions")
        st.divider()
        # Visualize generated questions and store them
        show_questions(st.session_state.get('questions_dict_chapter'))
        breaks(1)
        show_download_controls(st.session_state.get('selected_chapter_title'), st.session_state.get('questions_dict_chapter', 'None'))
        debug_show_selected_questions()

        # Sidebar export of the accumulated questions as a Word document.
        with st.sidebar:
            st.markdown("---")  # Divider
            st.markdown("**Download Questions**")  # Spacing

            docx_file = create_docx_from_data(st.session_state['questions_to_download'])

            st.download_button(
                label="📄 Download as Word (.docx)",
                data=docx_file,
                file_name="questions.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                on_click="ignore"
            )

else:
    st.info("Please set a valid page range to continue.")
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
|
app/pages/2_topic_questions.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Streamlit page: generate questions on a user-provided topic or keyword.

Retrieves the most relevant text chunks from the ChromaDB collection built
at upload time, builds an LLM prompt, and renders the generated questions.
"""
import streamlit as st
import chromadb
from app.utils import *
from app.main_IO import *
from app.download_questions import create_docx_from_data
from app.pages.utils_chapter.display_questions import *
from app.pages.utils_chapter.chapter_selection import select_num_questions
from app.backend.chunks_processing import query_collection
from app.backend.messages_templates import book_prompt
from app.backend.runpod_client import run_prompt, clean_and_parse_json


# Initialise
apply_style()

# add_sidebar_header()
st.sidebar.html("""
    <div style='position: fixed; top: 10px; left: 20px; z-index: 999; padding: 10px;'>
        <h3>Menu</h3>
    </div>
""")

show_pdf_preview()
st.title("Generate Questions on a Topic")
st.divider()
st.write("""Here, you can generate questions based on a specific topic.
You can enter a topic or keyword, and the app will generate questions based on the content of the uploaded PDF.""")

breaks(1)

# Set up logger
# NOTE(review): `logging` is not imported directly in this file — presumably
# re-exported by a star import above; confirm.
if st.session_state.use_logger:
    level = st.selectbox("Logging level", ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
    logging.getLogger().setLevel(getattr(logging, level))

# The query form is only usable once the PDF text has been extracted and
# indexed (done on the main page), since it reads the ChromaDB collection.
if st.session_state.get("full_text", None) is not None:
    client = chromadb.Client()  # Use same client init/config
    whole_text_collection = client.get_collection("whole_text_chunks")
    debug_log(f"Collection name: {whole_text_collection.name}")
    debug_log(f"Number of documents: {whole_text_collection.count()}")
    results = whole_text_collection.get(limit=1)

    documents = results['documents']  # List of text chunks
    metadatas = results['metadatas']  # List of metadata dicts, e.g. chunk indexes

    # Debug dump of the first stored chunk and its metadata.
    for i, (doc, meta) in enumerate(zip(documents, metadatas)):
        debug_log(f"Chunk {i}:")
        debug_log(doc)
        debug_log(f"Metadata: {meta}")


    with st.form("query_form"):
        st.subheader("Enter a Topic or Keyword")
        query = st.text_input("Enter your query:")
        col1, _ = st.columns([2, 6])
        with col1:
            num_questions = select_num_questions()
        breaks(1)
        submitted = st.form_submit_button("Submit")

    if submitted and query:
        with st.spinner("Generating questions..."):
            # Generate questions based on the query:
            # retrieve relevant chunks, build the prompt, call the LLM,
            # then parse its JSON output into a dict.
            query_context = query_collection(whole_text_collection, query=query, nresults=3, context_multiplier=2)
            prompt = book_prompt(query_context, num_questions=num_questions, user_query=query)
            questions_json = run_prompt(prompt)
            st.session_state.questions_dict_topic = clean_and_parse_json(questions_json)
            st.session_state['query'] = query
            st.session_state['questions_ready_topic'] = True

    if st.session_state.get("questions_ready_topic"):
        breaks(2)
        st.subheader("Generated Questions")
        st.divider()
        debug_log(f"Generated questions: {st.session_state.get('questions_dict_topic', 'None')}")

        # Visualize generated questions and store them
        show_questions(st.session_state['questions_dict_topic'])
        breaks(1)
        show_download_controls(st.session_state.get('query'), st.session_state.get('questions_dict_topic', 'None'))
        debug_show_selected_questions()

        # Sidebar export of the accumulated questions as a Word document.
        with st.sidebar:
            st.divider()  # Divider
            st.markdown("**Download Questions**")  # Spacing

            docx_file = create_docx_from_data(st.session_state.get('questions_to_download', {}))

            st.download_button(
                label="📄 Download as Word (.docx)",
                data=docx_file,
                file_name="questions.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                on_click="ignore"
            )




# query_context = query_collection(whole_text_collection, query=query, nresults=3, context_multiplier=2)
# out3 = book_prompt(query_context, num_questions=3, user_query=query)
# questions = run_prompt(out)


# use https://docs.streamlit.io/develop/api-reference/chat/st.chat_input
# or https://docs.streamlit.io/develop/api-reference/widgets/st.text_input
|
app/pages/3_inspect_pdf.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Streamlit page placeholder for the upcoming PDF-inspection feature."""
import streamlit as st
from app.utils import *

# Page set-up
apply_style()

PAGE_TITLE = "Work in Progress: Inspect PDF"
st.title(PAGE_TITLE)
|
| 8 |
+
|
app/pages/__init__.py
ADDED
|
File without changes
|
app/pages/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (161 Bytes). View file
|
|
|
app/pages/__pycache__/page1_utils.cpython-312.pyc
ADDED
|
Binary file (4.7 kB). View file
|
|
|
app/pages/utils_chapter/__init__.py
ADDED
|
File without changes
|
app/pages/utils_chapter/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (175 Bytes). View file
|
|
|
app/pages/utils_chapter/__pycache__/chapter_extraction.cpython-312.pyc
ADDED
|
Binary file (5.1 kB). View file
|
|
|
app/pages/utils_chapter/__pycache__/chapter_selection.cpython-312.pyc
ADDED
|
Binary file (4.56 kB). View file
|
|
|
app/pages/utils_chapter/__pycache__/display_pages.cpython-312.pyc
ADDED
|
Binary file (5.76 kB). View file
|
|
|
app/pages/utils_chapter/__pycache__/display_questions.cpython-312.pyc
ADDED
|
Binary file (5.75 kB). View file
|
|
|
app/pages/utils_chapter/__pycache__/download_questions.cpython-312.pyc
ADDED
|
Binary file (1.46 kB). View file
|
|
|
app/pages/utils_chapter/__pycache__/page1_utils.cpython-312.pyc
ADDED
|
Binary file (6.09 kB). View file
|
|
|