asaeed23 commited on
Commit
1222b5e
·
verified ·
1 Parent(s): 2c43ba7

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +76 -0
utils.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import re
3
+ import faiss
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+ import requests
7
+
8
# Load the sentence-embedding model once at module import so every
# function below reuses the same weights instead of reloading per call.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
10
+
11
def extract_pdf_by_page(path):
    """Extract the text of a PDF, one entry per page.

    Args:
        path: Filesystem path to the PDF file.

    Returns:
        list[dict]: One dict per page, with keys "page_number"
        (int, 1-based) and "text" (str, the page's extracted text).
    """
    pages = []
    # Use the document as a context manager so the file handle is
    # closed even if extraction raises (the original never closed it).
    with fitz.open(path) as doc:
        for page_num, page in enumerate(doc, 1):
            pages.append({"page_number": page_num, "text": page.get_text()})
    return pages
18
+
19
def split_by_chapter(pages):
    """Group extracted pages into chapters using a "Chapter N" heading.

    A page whose text contains something like "Chapter 3:" starts a new
    chapter; pages before the first heading accumulate into an initial
    "Introduction" chapter. That introduction is appended even when it
    is empty (first page already a chapter) — preserved original behavior.

    Args:
        pages: Ordered list of {"page_number": int, "text": str} dicts,
            as produced by extract_pdf_by_page.

    Returns:
        list[dict]: Chapters with keys "title" (str), "text" (str), and
        "pages" (list[int] of contributing page numbers).
    """
    chapters = []
    current = {"title": "Introduction", "text": "", "pages": []}
    # Case-insensitive "chapter <digits>" followed by ':', '.', or whitespace.
    chapter_pattern = re.compile(r'chapter\s+\d+[:.\s]', re.IGNORECASE)

    for page in pages:
        # Search once and reuse the match object; the original ran the
        # same regex twice per chapter-starting page.
        match = chapter_pattern.search(page["text"])
        if match:
            chapters.append(current)
            current = {
                "title": match.group().strip(),
                "text": page["text"],
                "pages": [page["page_number"]],
            }
        else:
            current["text"] += "\n" + page["text"]
            current["pages"].append(page["page_number"])
    chapters.append(current)
    return chapters
37
+
38
def build_faiss_index(chunks):
    """Embed each chunk's text and build an exact L2 FAISS index over it.

    Args:
        chunks: List of dicts, each carrying a "text" key.

    Returns:
        tuple: (index, texts, chunks) — the populated faiss.IndexFlatL2,
        the raw text list in index-row order, and the original chunks,
        so search hits can be mapped back to their source chunk.
    """
    corpus = [chunk["text"] for chunk in chunks]
    vectors = embed_model.encode(corpus, convert_to_numpy=True)
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index, corpus, chunks
44
+
45
def retrieve_text(query, index, texts, chunks, top_k=1):
    """Return the top_k chunks most similar to the query by L2 distance.

    Args:
        query: Natural-language query string.
        index: FAISS index produced by build_faiss_index.
        texts: Raw texts aligned with the index (unused here; kept so
            callers unpacking build_faiss_index pass straight through).
        chunks: Chunk dicts aligned with the index rows.
        top_k: Number of nearest chunks to return.

    Returns:
        list: The top_k matching chunk dicts, best match first.
    """
    embedded = embed_model.encode([query])
    _distances, row_ids = index.search(embedded, top_k)
    return [chunks[row] for row in row_ids[0]]
49
+
50
def generate_notes_questions(text, groq_api_key, model="meta-llama/llama-4-scout-17b-16e-instruct"):
    """Ask the Groq chat API for summary notes and comprehension questions.

    Args:
        text: Source content; only the first 3000 characters are sent to
            keep the prompt bounded.
        groq_api_key: Bearer token for api.groq.com.
        model: Groq model identifier to query.

    Returns:
        str: The model's reply (bullet notes plus five Q&A pairs).

    Raises:
        requests.HTTPError: If the API returns an error status.
        requests.Timeout: If the API does not respond within the timeout.
    """
    prompt = f"""
You are an educational assistant. Given the following content, generate:
1. Bullet point summary notes.
2. Five comprehension questions with their answers.

--- BEGIN CONTENT ---
{text[:3000]}
--- END CONTENT ---
"""

    headers = {
        "Authorization": f"Bearer {groq_api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.7
    }

    url = "https://api.groq.com/openai/v1/chat/completions"
    # Fix: the original call had no timeout, so a stalled API connection
    # would hang this function indefinitely.
    response = requests.post(url, headers=headers, json=payload, timeout=60)
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']