Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| import re | |
| import faiss | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| import requests | |
# Load embedding model once at module import; this global is shared by
# build_faiss_index (chunk embeddings) and retrieve_text (query embeddings).
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
def extract_pdf_by_page(path):
    """Extract text from a PDF, one entry per page.

    Args:
        path: Filesystem path to the PDF file.

    Returns:
        list[dict]: One dict per page with keys "page_number" (1-based int)
        and "text" (str, the raw extracted text of that page).
    """
    pages = []
    # Context manager guarantees the document handle is closed even if
    # text extraction raises; the original left the document open.
    with fitz.open(path) as doc:
        for page_num, page in enumerate(doc, 1):
            pages.append({"page_number": page_num, "text": page.get_text()})
    return pages
def split_by_chapter(pages):
    """Group extracted pages into chapters using "Chapter N" headings.

    Args:
        pages: List of dicts with "page_number" and "text" keys, as
            produced by extract_pdf_by_page, in document order.

    Returns:
        list[dict]: Chapters with keys "title", "text", "pages". Pages
        before the first heading form an "Introduction" chapter; that
        chapter is omitted when there are no such pages (the original
        emitted an empty Introduction in that case). Empty input yields [].
    """
    chapter_pattern = re.compile(r'chapter\s+\d+[:.\s]', re.IGNORECASE)
    chapters = []
    current = {"title": "Introduction", "text": "", "pages": []}
    for page in pages:
        match = chapter_pattern.search(page["text"])  # search once, reuse below
        if match:
            # Only keep the chapter in progress if it actually has content;
            # this drops the spurious empty "Introduction" chapter.
            if current["pages"] or current["text"].strip():
                chapters.append(current)
            current = {
                "title": match.group().strip(),
                "text": page["text"],
                "pages": [page["page_number"]],
            }
        else:
            current["text"] += "\n" + page["text"]
            current["pages"].append(page["page_number"])
    if current["pages"] or current["text"].strip():
        chapters.append(current)
    return chapters
def build_faiss_index(chunks):
    """Embed every chunk's text and index the vectors for L2 search.

    Args:
        chunks: List of dicts, each carrying a "text" key.

    Returns:
        tuple: (FAISS IndexFlatL2 over the embeddings, list of the chunk
        texts in index order, the original chunks list).
    """
    corpus = [chunk["text"] for chunk in chunks]
    vectors = embed_model.encode(corpus, convert_to_numpy=True)
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index, corpus, chunks
def retrieve_text(query, index, texts, chunks, top_k=1):
    """Return the top_k chunks most similar to the query (L2 distance).

    Args:
        query: Search string.
        index: FAISS index built by build_faiss_index.
        texts: Unused here; kept in the signature for caller compatibility.
        chunks: Chunk dicts in the same order the index was built.
        top_k: Number of nearest chunks to return.

    Returns:
        list[dict]: The top_k matching chunk dicts, best match first.
    """
    # convert_to_numpy=True for consistency with build_faiss_index, so the
    # query vector is guaranteed to be a numpy array for index.search.
    query_vec = embed_model.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_vec, top_k)  # distances not needed
    return [chunks[i] for i in indices[0]]
def generate_notes_questions(
    text,
    groq_api_key,
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    timeout=60,
    max_chars=3000,
):
    """Ask the Groq chat API for summary notes and comprehension questions.

    Args:
        text: Source content; only the first max_chars characters are sent.
        groq_api_key: Bearer token for api.groq.com.
        model: Groq model identifier to use for the completion.
        timeout: Seconds to wait for the HTTP response. New, defaulted
            parameter — the original call had no timeout and could hang
            indefinitely on a stalled connection.
        max_chars: Truncation limit applied to text (was hard-coded 3000).

    Returns:
        str: The assistant message content of the first choice.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    prompt = f"""
You are an educational assistant. Given the following content, generate:
1. Bullet point summary notes.
2. Five comprehension questions with their answers.
--- BEGIN CONTENT ---
{text[:max_chars]}
--- END CONTENT ---
"""
    headers = {
        "Authorization": f"Bearer {groq_api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
    }
    url = "https://api.groq.com/openai/v1/chat/completions"
    # Explicit timeout prevents an unbounded hang on network issues.
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']