Spaces:
Runtime error
Runtime error
| import fitz # PyMuPDF | |
| import easyocr | |
| import whisper | |
| import tempfile | |
| import os | |
| import uuid | |
| import genanki | |
| import docx | |
| import yt_dlp | |
| import csv | |
| from transformers import pipeline | |
| import streamlit as st | |
def process_pdf(path):
    """Extract text from a PDF, using OCR for pages that have no text layer.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The text of all pages concatenated in page order. Returns "" when the
        PDF cannot be opened; in that case the error is reported through
        ``st.error``.
    """
    try:
        doc = fitz.open(path)
    except Exception as e:
        st.error(f"❌ Could not open PDF: {str(e)}")
        return ""

    parts = []
    # The OCR reader is expensive to build, so it is created only when the
    # first page without extractable text is encountered.
    reader = None
    try:
        for page in doc:
            page_text = page.get_text()
            if page_text.strip():
                parts.append(page_text)
                continue

            if reader is None:
                reader = easyocr.Reader(['en'], gpu=False)

            # Render the page to a temporary PNG for OCR and always remove it
            # afterwards so repeated runs do not fill the temp directory.
            img_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}.png")
            try:
                page.get_pixmap().save(img_path)
                parts.append("\n".join(reader.readtext(img_path, detail=0)))
            finally:
                if os.path.exists(img_path):
                    os.remove(img_path)
    finally:
        doc.close()

    return "".join(parts)
def process_image(path):
    """OCR the image at *path* with an English, CPU-only EasyOCR reader.

    Returns the recognized text fragments joined by newlines.
    """
    fragments = easyocr.Reader(['en'], gpu=False).readtext(path, detail=0)
    return "\n".join(fragments)
def process_audio(path):
    """Transcribe the audio file at *path* with Whisper's "base" model.

    Returns the value stored under the "text" key of the transcription result.
    """
    transcription = whisper.load_model("base").transcribe(path)
    return transcription["text"]
def process_text(path):
    """Read plain text from a ``.txt`` or ``.docx`` file.

    The extension is matched case-insensitively, so ``NOTES.TXT`` is handled
    the same way as ``notes.txt``.

    Args:
        path: Filesystem path of the document.

    Returns:
        The file contents for ``.txt`` (decoded as UTF-8), the paragraphs
        joined by newlines for ``.docx``, or "" for any other extension.
    """
    lowered = path.lower()
    if lowered.endswith(".txt"):
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    if lowered.endswith(".docx"):
        document = docx.Document(path)
        return "\n".join(para.text for para in document.paragraphs)
    return ""
def process_youtube(url):
    """Download the audio track of a video with yt-dlp and transcribe it.

    Args:
        url: URL of the video to download.

    Returns:
        The transcript returned by ``process_audio`` for the extracted MP3.
    """
    base_path = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    # The FFmpegExtractAudio post-processor is configured for MP3, so the
    # file to transcribe is expected at "<base>.mp3".
    audio_path = f"{base_path}.mp3"
    ydl_opts = {
        'format': 'bestaudio/best',
        # Let yt-dlp name the raw download after its real container
        # extension instead of pre-labelling the unconverted stream ".mp3".
        'outtmpl': f"{base_path}.%(ext)s",
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'quiet': True,
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        return process_audio(audio_path)
    finally:
        # The audio is only needed for transcription; do not leave it behind
        # in the temp directory.
        if os.path.exists(audio_path):
            os.remove(audio_path)
def load_llm_swarm():
    """Build the named set of Hugging Face pipelines used for card generation.

    Encoder-decoder checkpoints (the Flan-T5 variants) are loaded with the
    "text2text-generation" task. The decoder-only checkpoints (BioGPT, Falcon,
    Mistral) are loaded with "text-generation" and ``return_full_text=False``
    so that "generated_text" holds only the completion, as it does for the
    text2text engines.

    An optional engine that fails to load is reported with ``st.warning`` and
    left out of the result. The "fallback" engine is required, so its load
    error is re-raised.

    Returns:
        dict mapping engine name to a loaded pipeline. It always contains
        "fallback".
    """
    # (engine name, pipeline task, model id, extra pipeline kwargs)
    specs = [
        ("fast", "text2text-generation", "google/flan-t5-small",
         {"max_length": 64}),
        ("bio", "text-generation", "microsoft/BioGPT-Large",
         {"tokenizer": "microsoft/BioGPT-Large", "return_full_text": False}),
        ("deep", "text-generation", "tiiuae/falcon-7b-instruct",
         {"return_full_text": False}),
        # NOTE(review): this repo id has no version suffix and may not resolve
        # on the Hub - confirm the intended checkpoint.
        ("mistral", "text-generation", "mistralai/Mistral-7B-Instruct",
         {"return_full_text": False}),
        ("fallback", "text2text-generation", "MBZUAI/LaMini-Flan-T5-783M", {}),
    ]

    swarm = {}
    for name, task, model_id, extra in specs:
        try:
            swarm[name] = pipeline(task, model=model_id, **extra)
        except Exception as exc:
            if name == "fallback":
                # Callers index llm_swarm["fallback"] directly.
                raise
            st.warning(f"⚠️ Skipping LLM '{name}' ({model_id}): {exc}")
    return swarm


# Loaded once at import time and shared by the card generator.
llm_swarm = load_llm_swarm()
def generate_flashcards(text, types=("Q&A",), max_cards=100):
    """Generate flashcards from *text* using the engines in ``llm_swarm``.

    The text is cut into 400-character chunks. For each non-blank chunk one
    prompt is built per requested card type, and each prompt is sent to a
    randomly chosen engine. If that engine raises, the "fallback" engine is
    used instead.

    Args:
        text: Source material to turn into cards.
        types: Card types to produce; any of "Q&A", "Cloze", "MCQ", "Reverse".
        max_cards: Upper bound on the number of cards returned.

    Returns:
        list of dicts with the keys "question", "answer" and "tag".
    """
    from random import choice

    # Instruction line per card type, in the order prompts are emitted.
    instructions = (
        ("Q&A", "Generate a question and answer:"),
        ("Cloze", "Make a cloze deletion from:"),
        ("MCQ", "Generate a multiple choice question:"),
        ("Reverse", "Generate a question and answer:"),
    )
    limit = max(max_cards, 0)

    chunks = [text[i:i + 400] for i in range(0, len(text), 400)]
    # Blank chunks carry nothing to ask about; skip them before applying the cap.
    chunks = [chunk for chunk in chunks if chunk.strip()][:limit]

    # Cap the prompts as well, so that requesting several card types cannot
    # produce more than max_cards cards.
    jobs = [
        (tag, f"{instruction}\n{chunk}")
        for chunk in chunks
        for tag, instruction in instructions
        if tag in types
    ][:limit]

    engine_names = list(llm_swarm)
    cards = []
    for tag, prompt in jobs:
        engine = llm_swarm[choice(engine_names)]
        try:
            output = engine(prompt, max_length=128)[0]["generated_text"]
        except Exception:
            # Any failure of the chosen engine is retried once on the fallback.
            output = llm_swarm["fallback"](prompt, max_length=64)[0]["generated_text"]

        if tag in ("Q&A", "Reverse"):
            # Text before the first ":" is taken as the question and the rest
            # as the answer.
            if ":" in output:
                q, a = output.split(":", 1)
            else:
                q, a = "Question", output
            if tag == "Reverse":
                q, a = a, q
            cards.append({"question": q.strip(), "answer": a.strip(), "tag": tag})
        elif tag == "Cloze":
            cards.append({"question": output.strip(), "answer": "[...]", "tag": tag})
        elif tag == "MCQ":
            cards.append({"question": output.strip(), "answer": "Choose best option", "tag": tag})
    return cards
def export_to_csv(cards, filename="batanki_cards.csv"):
    """Write *cards* to *filename* as UTF-8 CSV.

    The first row is the header Question, Answer, Type. Each card then
    contributes one row taken from its "question", "answer" and "tag" keys.
    """
    header = ["Question", "Answer", "Type"]
    rows = ([entry["question"], entry["answer"], entry["tag"]] for entry in cards)
    with open(filename, "w", newline="", encoding="utf-8") as out:
        sheet = csv.writer(out)
        sheet.writerow(header)
        sheet.writerows(rows)
def export_to_apkg(cards, deck_name="BatAnkiDeck"):
    """Package *cards* into an Anki ``.apkg`` file named after the deck.

    Args:
        cards: Iterable of dicts with "question" and "answer" keys.
        deck_name: Deck name; also used as the output file stem.

    Returns:
        Path of the written package, ``"<deck_name>.apkg"``.
    """
    # Keep the random deck id inside [2**30, 2**31), the range the genanki
    # README suggests. The previous 64-bit value could exceed the signed
    # 64-bit range of a SQLite INTEGER.
    deck_id = (1 << 30) + uuid.uuid4().int % (1 << 30)
    model = genanki.Model(
        1607392319,
        "BatAnkiModel",
        fields=[{"name": "Question"}, {"name": "Answer"}],
        templates=[{
            "name": "Card 1",
            "qfmt": "{{Question}}",
            "afmt": "{{FrontSide}}<hr id='answer'>{{Answer}}",
        }],
    )
    deck = genanki.Deck(deck_id, deck_name)
    for card in cards:
        deck.add_note(
            genanki.Note(model=model, fields=[card["question"], card["answer"]])
        )
    output_path = f"{deck_name}.apkg"
    genanki.Package(deck).write_to_file(output_path)
    return output_path