Pavaas committed on
Commit
b5d5f6a
·
verified ·
1 Parent(s): 958177e

Update config.py

Browse files
Files changed (1) hide show
  1. config.py +75 -52
config.py CHANGED
@@ -1,5 +1,4 @@
1
  import fitz # PyMuPDF
2
- import pytesseract
3
  import easyocr
4
  import whisper
5
  import tempfile
@@ -10,25 +9,30 @@ import docx
10
  import yt_dlp
11
  import csv
12
  from transformers import pipeline
13
- from PIL import Image
14
-
15
- # === Extract Text From Sources ===
16
 
17
def process_pdf(path):
    """Extract text from a PDF, using Tesseract OCR for pages without text.

    Args:
        path: Location of the PDF; passed straight to ``fitz.open``.

    Returns:
        The text of all pages concatenated in page order. A page whose text
        layer is empty or whitespace-only is rendered to a bitmap and run
        through ``pytesseract.image_to_string`` instead.
    """
    doc = fitz.open(path)
    parts = []
    try:
        for page in doc:
            page_text = page.get_text()
            if page_text.strip():
                parts.append(page_text)
                continue
            # No usable text layer: rasterise the page and OCR the bitmap.
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            parts.append(pytesseract.image_to_string(img))
    finally:
        # Release the document even when rendering or OCR raises.
        doc.close()
    return "".join(parts)
29
 
30
def process_image(path):
    """Run English EasyOCR on the image at *path*.

    Returns the recognised text fragments joined with newlines.
    """
    ocr = easyocr.Reader(['en'])
    fragments = ocr.readtext(path, detail=0)
    return "\n".join(fragments)
34
 
@@ -63,66 +67,85 @@ def process_youtube(url):
63
  ydl.download([url])
64
  return process_audio(audio_path)
65
 
66
- # === Flashcard Generator ===
 
 
 
 
 
 
 
 
 
67
 
68
def generate_flashcards(text, model_name="t5-base", types=("Q&A",)):
    """Generate at most 20 flashcards from *text* with a text2text model.

    The text is cut into 400-character chunks. For every chunk one prompt is
    sent to the model for each requested card type, in the fixed order
    Q&A, Cloze, MCQ, Reverse.

    Args:
        text: Source material to turn into flashcards.
        model_name: Model identifier handed to ``transformers.pipeline``.
        types: Card types to produce; any of "Q&A", "Cloze", "MCQ",
            "Reverse". Unknown entries are ignored.

    Returns:
        A list of dicts with the keys "question", "answer" and "tag".
        Empty when *text* is empty or no known card type was requested; in
        that case the model is not loaded at all.
    """
    max_cards = 20
    templates = (
        ("Q&A", "Generate a question and answer from:\n{}"),
        ("Cloze", "Create a cloze deletion flashcard from:\n{}"),
        ("MCQ", "Generate a multiple choice question from:\n{}"),
        ("Reverse", "Generate a question and answer from:\n{}"),
    )
    wanted = [(tag, template) for tag, template in templates if tag in types]
    if not text or not wanted:
        # Nothing to generate: skip the expensive model load entirely.
        return []

    generator = pipeline("text2text-generation", model=model_name, max_length=64)
    cards = []
    for start in range(0, len(text), 400):
        chunk = text[start:start + 400]
        for tag, template in wanted:
            output = generator(template.format(chunk))[0]['generated_text']
            if tag in ("Q&A", "Reverse"):
                # Text before the first colon is taken as the question;
                # without a colon the whole output becomes the answer.
                q, a = output.split(":", 1) if ":" in output else ("Question", output)
                if tag == "Reverse":
                    q, a = a, q
                cards.append({"question": q.strip(), "answer": a.strip(), "tag": tag})
            elif tag == "Cloze":
                cards.append({"question": output.strip(), "answer": "[...]", "tag": tag})
            else:
                cards.append({"question": output.strip(), "answer": "Choose best option", "tag": tag})
            # Enforce the cap per card, not per chunk, so it cannot overshoot.
            if len(cards) >= max_cards:
                return cards
    return cards
96
 
97
- # === Exporters ===
 
 
 
 
 
98
 
99
def export_to_apkg(cards, deck_name):
    """Write *cards* to an Anki package in the system temp directory.

    Args:
        cards: Iterable of dicts with "question" and "answer" keys and an
            optional "tag" key (an empty string is used when absent).
        deck_name: Deck title; also used as the ``.apkg`` file stem.

    Returns:
        Path of the written ``<deck_name>.apkg`` file.
    """
    # Deck id: the first ten decimal digits of a random UUID's integer value.
    random_digits = str(uuid.uuid4().int)[:10]
    deck = genanki.Deck(int(random_digits), deck_name)
    note_model = genanki.Model(
        1607392319,
        'BatAnkiModel',
        fields=[{'name': 'Question'}, {'name': 'Answer'}, {'name': 'Tag'}],
        templates=[{
            'name': 'Card 1',
            'qfmt': '{{Question}}<br><i>Tag: {{Tag}}</i>',
            'afmt': '{{FrontSide}}<hr id="answer">{{Answer}}',
        }]
    )
    for entry in cards:
        values = [entry['question'], entry['answer'], entry.get('tag', "")]
        deck.add_note(genanki.Note(model=note_model, fields=values))
    target = os.path.join(tempfile.gettempdir(), f"{deck_name}.apkg")
    genanki.Package(deck).write_to_file(target)
    return target
120
-
121
def export_to_csv(cards, deck_name):
    """Write *cards* to ``<deck_name>.csv`` in the system temp directory.

    The file is UTF-8 with a "Question, Answer, Tag" header row followed by
    one row per card; a card without a "tag" key gets an empty tag cell.

    Returns:
        Path of the written CSV file.
    """
    out_path = os.path.join(tempfile.gettempdir(), f"{deck_name}.csv")
    header = ["Question", "Answer", "Tag"]
    with open(out_path, "w", newline="", encoding="utf-8") as handle:
        sheet = csv.writer(handle)
        sheet.writerow(header)
        sheet.writerows(
            [entry["question"], entry["answer"], entry.get("tag", "")]
            for entry in cards
        )
    return out_path
 
1
  import fitz # PyMuPDF
 
2
  import easyocr
3
  import whisper
4
  import tempfile
 
9
  import yt_dlp
10
  import csv
11
  from transformers import pipeline
12
+ import streamlit as st
 
 
13
 
14
def process_pdf(path):
    """Extract text from a PDF, using EasyOCR for pages without a text layer.

    Args:
        path: Location of the PDF; passed straight to ``fitz.open``.

    Returns:
        The text of all pages concatenated in page order. If the PDF cannot
        be opened, the error is shown through ``st.error`` and an empty
        string is returned.
    """
    import os

    try:
        doc = fitz.open(path)
    except Exception as e:
        st.error(f"❌ Could not open PDF: {str(e)}")
        return ""

    parts = []
    reader = None  # Built lazily: constructing the OCR reader is expensive.
    try:
        for page in doc:
            page_text = page.get_text()
            if page_text.strip():
                parts.append(page_text)
                continue
            if reader is None:
                reader = easyocr.Reader(['en'], gpu=False)
            # No usable text layer: render the page to a PNG and OCR it.
            pix = page.get_pixmap()
            # mkstemp picks the platform temp directory instead of assuming
            # that /tmp exists.
            fd, img_path = tempfile.mkstemp(suffix=".png")
            os.close(fd)
            try:
                pix.save(img_path)
                parts.append("\n".join(reader.readtext(img_path, detail=0)))
            finally:
                # Do not leave one stray image behind per scanned page.
                os.remove(img_path)
    finally:
        # Release the document even when rendering or OCR raises.
        doc.close()
    return "".join(parts)
33
 
34
def process_image(path):
    """Run CPU-only English EasyOCR on the image at *path*.

    Returns the recognised text fragments joined with newlines.
    """
    ocr = easyocr.Reader(['en'], gpu=False)
    return "\n".join(ocr.readtext(path, detail=0))
38
 
 
67
  ydl.download([url])
68
  return process_audio(audio_path)
69
 
70
def load_llm_swarm():
    """Build the generation pipelines used for flashcards, keyed by engine name.

    Returns:
        A dict mapping "fast", "bio", "deep", "mistral" and "fallback" to a
        ``transformers`` pipeline created with the "text2text-generation"
        task.
    """
    # NOTE(review): every engine is requested with the text2text-generation
    # task. Confirm that each checkpoint is a model that task accepts and
    # that every model id below resolves on the Hub.
    task = "text2text-generation"
    engine_options = {
        "fast": {"model": "google/flan-t5-small", "max_length": 64},
        "bio": {"model": "microsoft/BioGPT-Large", "tokenizer": "microsoft/BioGPT-Large"},
        "deep": {"model": "tiiuae/falcon-7b-instruct"},
        "mistral": {"model": "mistralai/Mistral-7B-Instruct"},
        "fallback": {"model": "MBZUAI/LaMini-Flan-T5-783M"},
    }
    return {name: pipeline(task, **options) for name, options in engine_options.items()}


# Built once at import time; generate_flashcards reads this module-level dict.
llm_swarm = load_llm_swarm()
80
 
81
def generate_flashcards(text, types=("Q&A",), max_cards=100):
    """Generate flashcards from *text* using randomly chosen swarm engines.

    The text is cut into 400-character chunks. For every chunk one prompt is
    built for each requested card type, in the fixed order Q&A, Cloze, MCQ,
    Reverse. Each prompt is sent to a random engine from ``llm_swarm``; if
    that engine raises, the "fallback" engine is used instead.

    Args:
        text: Source material to turn into flashcards.
        types: Card types to produce; any of "Q&A", "Cloze", "MCQ",
            "Reverse". Unknown entries are ignored.
        max_cards: Upper bound on the number of cards returned. ``None``
            disables the limit; values below zero are treated as zero.

    Returns:
        A list of dicts with the keys "question", "answer" and "tag".
    """
    from itertools import islice
    from random import choice

    templates = (
        ("Q&A", "Generate a question and answer:\n{}"),
        ("Cloze", "Make a cloze deletion from:\n{}"),
        ("MCQ", "Generate a multiple choice question:\n{}"),
        ("Reverse", "Generate a question and answer:\n{}"),
    )
    wanted = [(tag, template) for tag, template in templates if tag in types]
    limit = None if max_cards is None else max(max_cards, 0)

    chunks = (text[i:i + 400] for i in range(0, len(text), 400))
    all_jobs = (
        (tag, template.format(chunk))
        for chunk in chunks
        for tag, template in wanted
    )
    # Every prompt yields exactly one card, so capping the prompts caps the
    # cards. Capping the chunks instead would allow several cards per chunk.
    jobs = list(islice(all_jobs, limit))
    if not jobs:
        return []

    engine_names = list(llm_swarm)
    cards = []
    for tag, prompt in jobs:
        engine = llm_swarm[choice(engine_names)]
        try:
            output = engine(prompt, max_length=128)[0]["generated_text"]
        except Exception:
            # Best effort: any engine failure falls back to the small model.
            # Exception (not a bare except) keeps Ctrl-C and SystemExit working.
            output = llm_swarm["fallback"](prompt, max_length=64)[0]["generated_text"]
        cards.append(_make_card(tag, output))
    return cards


def _make_card(tag, output):
    """Turn one raw model output into a flashcard dict for card type *tag*."""
    if tag in ("Q&A", "Reverse"):
        # Text before the first colon is taken as the question; without a
        # colon the whole output becomes the answer.
        if ":" in output:
            q, a = output.split(":", 1)
        else:
            q, a = "Question", output
        if tag == "Reverse":
            q, a = a, q
        return {"question": q.strip(), "answer": a.strip(), "tag": tag}
    if tag == "Cloze":
        return {"question": output.strip(), "answer": "[...]", "tag": tag}
    return {"question": output.strip(), "answer": "Choose best option", "tag": tag}
125
 
126
def export_to_csv(cards, filename="batanki_cards.csv"):
    """Write *cards* to a UTF-8 CSV file.

    The file starts with a "Question, Answer, Type" header row followed by
    one row per card.

    Args:
        cards: Iterable of dicts with "question" and "answer" keys and an
            optional "tag" key (an empty cell is written when absent).
        filename: Destination path of the CSV file.

    Returns:
        *filename*, so callers can hand the written path on directly.
    """
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Question", "Answer", "Type"])
        writer.writerows(
            [card["question"], card["answer"], card.get("tag", "")]
            for card in cards
        )
    return filename
132
 
133
def export_to_apkg(cards, deck_name="BatAnkiDeck"):
    """Write *cards* to ``<deck_name>.apkg`` in the current directory.

    Args:
        cards: Iterable of dicts with "question" and "answer" keys.
        deck_name: Deck title; also used as the package file stem.

    Returns:
        Relative path of the written ``.apkg`` file.
    """
    # Random id within [2**30, 2**31). The previous `int(uuid4) >> 64` could
    # yield values up to 2**64 - 1, which is too large for the signed 64-bit
    # integers SQLite stores.
    deck_id = (1 << 30) + uuid.uuid4().int % (1 << 30)

    model = genanki.Model(
        1607392319,
        "BatAnkiModel",
        fields=[{"name": "Question"}, {"name": "Answer"}],
        templates=[{
            "name": "Card 1",
            "qfmt": "{{Question}}",
            "afmt": "{{FrontSide}}<hr id='answer'>{{Answer}}",
        }]
    )
    deck = genanki.Deck(deck_id, deck_name)
    for card in cards:
        note = genanki.Note(model=model, fields=[card["question"], card["answer"]])
        deck.add_note(note)
    output_path = f"{deck_name}.apkg"
    genanki.Package(deck).write_to_file(output_path)
    return output_path