"""Split a PDF into chapters, index them with FAISS, and generate study notes.

Pipeline: ``extract_pdf_by_page`` -> ``split_by_chapter`` ->
``build_faiss_index`` -> ``retrieve_text`` -> ``generate_notes_questions``.
"""

import re

import faiss
import fitz  # PyMuPDF
import numpy as np
import requests
from sentence_transformers import SentenceTransformer

# Load embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Matches headings such as "Chapter 3:", "chapter 12." or "CHAPTER 4 ".
# Compiled once at import time instead of on every split_by_chapter call.
_CHAPTER_PATTERN = re.compile(r'chapter\s+\d+[:.\s]', re.IGNORECASE)


def extract_pdf_by_page(path):
    """Return the text of every page of the PDF at *path*.

    Args:
        path: Location of the PDF, passed straight to ``fitz.open``.

    Returns:
        A list of ``{"page_number": int, "text": str}`` dicts, one per page,
        with page numbers starting at 1.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(path) as doc:
        return [
            {"page_number": page_num, "text": page.get_text()}
            for page_num, page in enumerate(doc, 1)
        ]


def split_by_chapter(pages):
    """Group consecutive pages into chapters.

    A page whose text matches ``_CHAPTER_PATTERN`` starts a new chapter titled
    with the matched heading; every other page is appended to the chapter
    currently being built. Pages that precede the first heading are collected
    under the title ``"Introduction"``.

    Args:
        pages: Iterable of dicts with ``"page_number"`` and ``"text"`` keys,
            as produced by ``extract_pdf_by_page``.

    Returns:
        A list of ``{"title": str, "text": str, "pages": list[int]}`` dicts in
        document order. The list is never empty: with no input pages it holds
        a single empty ``"Introduction"`` chapter.
    """
    chapters = []
    current = {"title": "Introduction", "text": "", "pages": []}

    for page in pages:
        heading = _CHAPTER_PATTERN.search(page["text"])  # search only once
        if heading:
            # Do not emit the placeholder "Introduction" chapter when the
            # document opens directly with a chapter heading.
            if current["pages"]:
                chapters.append(current)
            current = {
                "title": heading.group().strip(),
                "text": page["text"],
                "pages": [page["page_number"]],
            }
        else:
            # Separate pages with a newline, but do not start a chapter's
            # text with one.
            separator = "\n" if current["pages"] else ""
            current["text"] += separator + page["text"]
            current["pages"].append(page["page_number"])

    chapters.append(current)
    return chapters


def build_faiss_index(chunks):
    """Embed the text of each chunk and store the vectors in a flat L2 index.

    Args:
        chunks: Non-empty sequence of dicts that each have a ``"text"`` key.

    Returns:
        A ``(index, texts, chunks)`` tuple: the populated
        ``faiss.IndexFlatL2``, the list of embedded texts, and the *chunks*
        argument itself. Vector ``i`` in the index belongs to ``chunks[i]``.

    Raises:
        ValueError: If *chunks* is empty, because the index dimension is
            taken from the embeddings and cannot be derived from nothing.
    """
    if not chunks:
        raise ValueError("chunks must contain at least one item to build an index")

    texts = [chunk["text"] for chunk in chunks]
    embeddings = embed_model.encode(texts, convert_to_numpy=True)
    # FAISS only accepts C-contiguous float32 matrices.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, texts, chunks


def retrieve_text(query, index, texts, chunks, top_k=1):
    """Return the chunks whose embeddings are nearest to *query*.

    Args:
        query: Search string; embedded with the module-level ``embed_model``.
        index: FAISS index returned by ``build_faiss_index``.
        texts: Unused; accepted so the tuple returned by
            ``build_faiss_index`` can be passed through unchanged.
        chunks: The chunk list the index was built from.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to *top_k* chunks in the order FAISS reports them (nearest first).
        Empty when *top_k* is not positive or the index holds no vectors.
    """
    # Never ask for more neighbours than the index holds.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_vec = np.ascontiguousarray(embed_model.encode([query]), dtype=np.float32)
    _distances, neighbours = index.search(query_vec, k)

    # FAISS fills missing results with -1, which Python would silently treat
    # as "the last chunk"; keep only ids that really address a chunk.
    return [chunks[i] for i in neighbours[0] if 0 <= i < len(chunks)]


def generate_notes_questions(
    text,
    groq_api_key,
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    *,
    timeout=60,
    max_chars=3000,
):
    """Ask the Groq chat-completions API for notes and questions about *text*.

    Args:
        text: Source content; only the first *max_chars* characters are sent.
        groq_api_key: API key sent as a bearer token.
        model: Model identifier placed in the request payload.
        timeout: Seconds to wait for the HTTP request before ``requests``
            gives up. Without it a stalled connection blocks indefinitely.
        max_chars: Number of leading characters of *text* put in the prompt.

    Returns:
        The ``content`` string of the first choice in the API response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the request exceeds *timeout*.
    """
    prompt = f"""
You are an educational assistant. Given the following content, generate:
1. Bullet point summary notes.
2. Five comprehension questions with their answers.

--- BEGIN CONTENT ---
{text[:max_chars]}
--- END CONTENT ---
"""

    headers = {
        "Authorization": f"Bearer {groq_api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.7
    }

    url = "https://api.groq.com/openai/v1/chat/completions"
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']