# Notes-Generator / utils.py
# asaeed23's picture
# Create utils.py
# 1222b5e verified
import fitz # PyMuPDF
import re
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import requests
# Load the sentence-embedding model once at import time. build_faiss_index and
# retrieve_text both encode with this shared instance, so chunk vectors and
# query vectors come from the same model.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
def extract_pdf_by_page(path):
    """Extract the text of every page of a PDF.

    Args:
        path: Location of the PDF, passed straight to ``fitz.open``.

    Returns:
        A list with one dict per page, in document order, shaped as
        ``{"page_number": int, "text": str}``. Page numbers start at 1.
    """
    # Open the document as a context manager so it (and its underlying file
    # handle) is closed even if text extraction fails part-way through.
    with fitz.open(path) as doc:
        return [
            {"page_number": page_num, "text": page.get_text()}
            for page_num, page in enumerate(doc, 1)
        ]
def split_by_chapter(pages):
    """Group consecutive pages into chapters based on "Chapter N" headings.

    A page whose text contains ``chapter <number>`` followed by ``:``, ``.``
    or whitespace (case-insensitive) starts a new chapter; every other page
    is appended to the chapter currently being built. Pages that come before
    the first heading are collected under the title ``"Introduction"``.

    Args:
        pages: Iterable of dicts with ``"page_number"`` and ``"text"`` keys,
            as produced by ``extract_pdf_by_page``.

    Returns:
        A list of dicts ``{"title": str, "text": str, "pages": list}`` in
        reading order. If ``pages`` is empty, a single empty
        ``"Introduction"`` entry is returned.
    """
    chapter_pattern = re.compile(r'chapter\s+\d+[:.\s]', re.IGNORECASE)
    chapters = []
    current = {"title": "Introduction", "text": "", "pages": []}

    for page in pages:
        # Search once and reuse the match for both the test and the title.
        heading = chapter_pattern.search(page["text"])
        if heading is None:
            current["text"] += "\n" + page["text"]
            current["pages"].append(page["page_number"])
            continue

        # Only the initial placeholder can have no pages at this point (a
        # real chapter always owns the page that opened it). Skip it so a
        # document that starts directly with a chapter heading does not get
        # a spurious, empty "Introduction" entry.
        if current["pages"]:
            chapters.append(current)
        current = {
            "title": heading.group().strip(),
            "text": page["text"],
            "pages": [page["page_number"]],
        }

    chapters.append(current)
    return chapters
def build_faiss_index(chunks):
    """Embed each chunk's text and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Sequence of dicts, each holding its content under ``"text"``.

    Returns:
        A ``(index, texts, chunks)`` tuple: the populated
        ``faiss.IndexFlatL2``, the list of texts that were embedded (same
        order as ``chunks``), and the ``chunks`` argument itself.
    """
    texts = [item["text"] for item in chunks]

    # One row per chunk; the column count is the embedding dimension.
    vectors = embed_model.encode(texts, convert_to_numpy=True)
    dimension = vectors.shape[1]

    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    return index, texts, chunks
def retrieve_text(query, index, texts, chunks, top_k=1):
    """Return the chunks whose embeddings are nearest to ``query``.

    Args:
        query: Free-text search string; embedded with ``embed_model``.
        index: FAISS index holding one vector per entry of ``chunks``.
        texts: Unused here; kept so callers can keep passing the tuple
            returned by ``build_faiss_index`` unchanged.
        chunks: Chunk dicts, positionally aligned with the index vectors.
        top_k: Maximum number of chunks to return.

    Returns:
        At most ``top_k`` chunks, in the order FAISS ranks them. Fewer are
        returned when the index holds fewer vectors; an empty list is
        returned when there is nothing to search or ``top_k`` is not
        positive.
    """
    # Never ask FAISS for more neighbours than exist: it pads missing results
    # with the id -1, and chunks[-1] would silently return the last chunk.
    k = min(top_k, index.ntotal, len(chunks))
    if k <= 0:
        return []

    query_vec = embed_model.encode([query], convert_to_numpy=True)
    _, neighbor_ids = index.search(query_vec, k)

    # Defensive filter in case the index and chunks are out of step or FAISS
    # still reports a padded (-1) id.
    return [chunks[i] for i in neighbor_ids[0] if 0 <= i < len(chunks)]
def generate_notes_questions(
    text,
    groq_api_key,
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    timeout=60,
):
    """Ask a Groq-hosted chat model for study notes and questions on ``text``.

    Only the first 3000 characters of ``text`` are included in the prompt.

    Args:
        text: Source content to summarise.
        groq_api_key: API key sent as a Bearer token.
        model: Model identifier placed in the request payload.
        timeout: Seconds to wait for the HTTP request before giving up;
            forwarded to ``requests.post``.

    Returns:
        The ``content`` string of the first choice in the API response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
        KeyError, IndexError: If the response JSON lacks the expected
            ``choices[0].message.content`` structure.
    """
    # The prompt lines are deliberately flush-left: any indentation inside the
    # triple-quoted string would become part of the text sent to the model.
    prompt = f"""
You are an educational assistant. Given the following content, generate:
1. Bullet point summary notes.
2. Five comprehension questions with their answers.
--- BEGIN CONTENT ---
{text[:3000]}
--- END CONTENT ---
"""
    headers = {
        "Authorization": f"Bearer {groq_api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.7
    }
    url = "https://api.groq.com/openai/v1/chat/completions"

    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']