File size: 2,375 Bytes
1222b5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import fitz  # PyMuPDF
import re
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import requests

# Load the sentence-embedding model once at module import time; the same
# model instance is shared by build_faiss_index and retrieve_text so that
# indexed vectors and query vectors live in the same embedding space.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def extract_pdf_by_page(path):
    """Read a PDF and return its plain text, one entry per page.

    Args:
        path: filesystem path to the PDF, passed straight to ``fitz.open``.

    Returns:
        List of ``{"page_number": int, "text": str}`` dicts in document
        order, with page numbers starting at 1.
    """
    document = fitz.open(path)
    extracted = []
    page_no = 0
    for page in document:
        page_no += 1
        extracted.append({"page_number": page_no, "text": page.get_text()})
    return extracted

def split_by_chapter(pages):
    """Group extracted pages into chapter-sized sections.

    Pages that precede the first chapter heading are collected under an
    "Introduction" section. A heading is any text matching
    ``chapter <number>`` followed by ``:``, ``.`` or whitespace,
    case-insensitively.

    Args:
        pages: list of ``{"page_number": int, "text": str}`` dicts in
            document order (as produced by ``extract_pdf_by_page``).

    Returns:
        List of ``{"title": str, "text": str, "pages": list[int]}`` dicts.
        Empty placeholder sections are dropped, so a document whose first
        page already starts a chapter yields no empty "Introduction".
    """
    chapter_pattern = re.compile(r'chapter\s+\d+[:.\s]', re.IGNORECASE)
    chapters = []
    current = {"title": "Introduction", "text": "", "pages": []}

    for page in pages:
        match = chapter_pattern.search(page["text"])  # search once, reuse below
        if match:
            # Close out the previous section — but only if it actually
            # accumulated content (fixes the empty-Introduction bug when
            # the very first page begins a chapter).
            if current["pages"] or current["text"].strip():
                chapters.append(current)
            current = {
                "title": match.group().strip(),
                "text": page["text"],
                "pages": [page["page_number"]],
            }
        else:
            current["text"] += "\n" + page["text"]
            current["pages"].append(page["page_number"])

    if current["pages"] or current["text"].strip():
        chapters.append(current)
    return chapters

def build_faiss_index(chunks):
    """Embed each chunk's text and build an exact L2 FAISS index over it.

    Args:
        chunks: list of dicts, each with at least a ``"text"`` key.

    Returns:
        Tuple ``(index, texts, chunks)`` — the FAISS index, the list of
        embedded texts (same order), and the original chunks unchanged.
    """
    corpus = []
    for chunk in chunks:
        corpus.append(chunk["text"])
    vectors = embed_model.encode(corpus, convert_to_numpy=True)
    dimension = vectors.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(vectors)
    return flat_index, corpus, chunks

def retrieve_text(query, index, texts, chunks, top_k=1):
    """Return the ``top_k`` chunks nearest to ``query`` in embedding space.

    Args:
        query: free-text search string.
        index: FAISS index built over the chunks' embeddings.
        texts: embedded texts (kept for interface parity; unused here).
        chunks: chunk dicts aligned with the index rows.
        top_k: number of nearest neighbours to return.

    Returns:
        List of chunk dicts, nearest first.
    """
    embedded_query = embed_model.encode([query])
    distances, neighbour_ids = index.search(embedded_query, top_k)
    hits = []
    for neighbour in neighbour_ids[0]:
        hits.append(chunks[neighbour])
    return hits

def generate_notes_questions(text, groq_api_key, model="meta-llama/llama-4-scout-17b-16e-instruct", timeout=60):
    """Ask the Groq chat-completions API for study notes and questions.

    Args:
        text: source content; only the first 3000 characters are sent,
            to bound prompt size.
        groq_api_key: bearer token for ``api.groq.com``.
        model: Groq model identifier.
        timeout: seconds to wait for the HTTP response; added because
            ``requests`` has no default timeout and a stalled connection
            would otherwise block forever.

    Returns:
        The generated assistant message content as a string.

    Raises:
        requests.HTTPError: on a non-2xx API response.
        requests.Timeout: if the request exceeds ``timeout`` seconds.
    """
    prompt = f"""
You are an educational assistant. Given the following content, generate:
1. Bullet point summary notes.
2. Five comprehension questions with their answers.

--- BEGIN CONTENT ---
{text[:3000]}
--- END CONTENT ---
"""

    headers = {
        "Authorization": f"Bearer {groq_api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.7
    }

    url = "https://api.groq.com/openai/v1/chat/completions"
    # timeout prevents an indefinite hang on a dead or slow connection
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']