BatAnki2.0

Runtime error

File size: 5,092 Bytes

cc9e5f0
 
 
b0e0069
cc9e5f0
 
b0e0069
 
 
 
 
b5d5f6a
cc9e5f0
b0e0069
cc9e5f0
b5d5f6a
 
 
 
 
 
cc9e5f0
b0e0069
 
 
 
cc9e5f0
b5d5f6a
 
 
 
cc9e5f0
 
b0e0069
b5d5f6a
b0e0069
cc9e5f0
 
b0e0069
cc9e5f0
b0e0069
cc9e5f0
 
b0e0069
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc9e5f0
b5d5f6a
 
 
 
 
 
 
 
 
 
b0e0069
b5d5f6a
 
b0e0069
b5d5f6a
b0e0069
cc9e5f0
b5d5f6a
cc9e5f0
b0e0069
b5d5f6a
 
b0e0069
b5d5f6a
 
b0e0069
b5d5f6a
 
b0e0069
b5d5f6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc9e5f0
b0e0069
 
b5d5f6a
 
 
 
 
 
cc9e5f0
b5d5f6a
 
cc9e5f0
 
b5d5f6a
 
b0e0069
b5d5f6a
 
 
b0e0069
 
b5d5f6a
cc9e5f0
b5d5f6a

import fitz  # PyMuPDF
import easyocr
import whisper
import tempfile
import os
import uuid
import genanki
import docx
import yt_dlp
import csv
from transformers import pipeline
import streamlit as st

def process_pdf(path):
    """Extract the text of a PDF, falling back to OCR for pages without text.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The concatenated text of all pages. Pages whose embedded text is
        empty or whitespace-only are rendered to a temporary PNG and read
        with EasyOCR instead. Returns "" (after reporting the problem via
        ``st.error``) when the PDF cannot be opened.
    """
    try:
        doc = fitz.open(path)
    except Exception as e:
        st.error(f"❌ Could not open PDF: {str(e)}")
        return ""

    parts = []
    reader = None  # built lazily: only needed when a page has no text layer
    try:
        for page in doc:
            page_text = page.get_text()
            if page_text.strip():
                parts.append(page_text)
                continue

            if reader is None:
                reader = easyocr.Reader(['en'], gpu=False)

            # Render the page to a uniquely named image in the platform temp
            # directory, and always delete it again once OCR has run.
            img_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}.png")
            try:
                page.get_pixmap().save(img_path)
                parts.append("\n".join(reader.readtext(img_path, detail=0)))
            finally:
                if os.path.exists(img_path):
                    os.remove(img_path)
    finally:
        doc.close()

    return "".join(parts)

def process_image(path):
    """Run EasyOCR on the image at *path* and return the lines it finds.

    A CPU-only reader configured for ``'en'`` is created on every call; the
    recognised strings are joined with newlines into a single string.
    """
    ocr_lines = easyocr.Reader(['en'], gpu=False).readtext(path, detail=0)
    return "\n".join(ocr_lines)

def process_audio(path):
    """Transcribe the audio file at *path* and return the transcript text.

    Loads Whisper's "base" model on every call and returns the ``"text"``
    entry of the transcription result.
    """
    transcription = whisper.load_model("base").transcribe(path)
    return transcription["text"]

def process_text(path):
    """Read the text content of a ``.txt`` or ``.docx`` file.

    The extension check is case-insensitive, so ``NOTES.TXT`` and
    ``Report.DOCX`` are handled the same as their lowercase spellings.

    Args:
        path: Filesystem path of the document.

    Returns:
        The file content for ``.txt`` files (decoded as UTF-8), the
        paragraphs joined by newlines for ``.docx`` files, and "" for any
        other extension.
    """
    lowered = path.lower()
    if lowered.endswith(".txt"):
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    if lowered.endswith(".docx"):
        document = docx.Document(path)
        return "\n".join(para.text for para in document.paragraphs)
    return ""

def process_youtube(url):
    """Download the audio of a video URL and return its transcript.

    The audio is fetched with yt-dlp into a uniquely named ``.mp3`` file in
    the system temp directory, handed to ``process_audio``, and the file is
    removed afterwards whether or not the download or transcription
    succeeded.

    Args:
        url: Address of the video to download.

    Returns:
        Whatever ``process_audio`` returns for the downloaded file.
    """
    audio_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}.mp3")
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': audio_path,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'quiet': True,
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        return process_audio(audio_path)
    finally:
        # Do not let downloaded audio pile up in the temp directory.
        if os.path.exists(audio_path):
            os.remove(audio_path)

def load_llm_swarm():
    """Build the named text2text-generation pipelines used for card generation.

    Each model is loaded independently. A model that fails to load is
    reported with ``st.warning`` and left out of the result instead of
    aborting the load of all the others.

    Returns:
        A dict mapping engine name ("fast", "bio", "deep", "mistral",
        "fallback") to its pipeline, containing only the engines that loaded.
    """
    # NOTE(review): several of these checkpoints look like decoder-only models
    # and the "mistral" repo id has no version suffix - confirm that each one
    # is actually loadable under the "text2text-generation" task.
    specs = {
        "fast": {"model": "google/flan-t5-small", "max_length": 64},
        "bio": {"model": "microsoft/BioGPT-Large", "tokenizer": "microsoft/BioGPT-Large"},
        "deep": {"model": "tiiuae/falcon-7b-instruct"},
        "mistral": {"model": "mistralai/Mistral-7B-Instruct"},
        "fallback": {"model": "MBZUAI/LaMini-Flan-T5-783M"},
    }

    swarm = {}
    for name, kwargs in specs.items():
        try:
            swarm[name] = pipeline("text2text-generation", **kwargs)
        except Exception as e:
            # Loading boundary: one unavailable or incompatible model must not
            # take the rest of the swarm down with it.
            st.warning(f"Could not load model '{name}' ({kwargs['model']}): {e}")
    return swarm

# Built once when this module is imported; generate_flashcards() picks its
# engines from this dict and uses its "fallback" entry when an engine fails.
llm_swarm = load_llm_swarm()

def generate_flashcards(text, types=["Q&A"], max_cards=100):
    from random import choice
    chunks = [text[i:i + 400] for i in range(0, len(text), 400)]
    chunks = chunks[:max_cards]
    cards = []

    prompts, tags = [], []
    for chunk in chunks:
        if "Q&A" in types:
            prompts.append(f"Generate a question and answer:\n{chunk}")
            tags.append("Q&A")
        if "Cloze" in types:
            prompts.append(f"Make a cloze deletion from:\n{chunk}")
            tags.append("Cloze")
        if "MCQ" in types:
            prompts.append(f"Generate a multiple choice question:\n{chunk}")
            tags.append("MCQ")
        if "Reverse" in types:
            prompts.append(f"Generate a question and answer:\n{chunk}")
            tags.append("Reverse")

    for i, prompt in enumerate(prompts):
        engine_name = choice(list(llm_swarm.keys()))
        engine = llm_swarm[engine_name]
        tag = tags[i]
        try:
            output = engine(prompt, max_length=128)[0]["generated_text"]
        except:
            output = llm_swarm["fallback"](prompt, max_length=64)[0]["generated_text"]

        if tag in ["Q&A", "Reverse"]:
            if ":" in output:
                q, a = output.split(":", 1)
            else:
                q, a = "Question", output
            if tag == "Reverse":
                q, a = a.strip(), q.strip()
            cards.append({"question": q.strip(), "answer": a.strip(), "tag": tag})
        elif tag == "Cloze":
            cards.append({"question": output.strip(), "answer": "[...]", "tag": tag})
        elif tag == "MCQ":
            cards.append({"question": output.strip(), "answer": "Choose best option", "tag": tag})

    return cards

def export_to_csv(cards, filename="batanki_cards.csv"):
    """Write *cards* to *filename* as UTF-8 CSV.

    The file starts with a ``Question,Answer,Type`` header row, followed by
    one row per card taken from its "question", "answer" and "tag" entries.
    """
    header = ["Question", "Answer", "Type"]
    with open(filename, "w", newline="", encoding="utf-8") as out:
        sheet = csv.writer(out)
        sheet.writerow(header)
        sheet.writerows(
            [entry["question"], entry["answer"], entry["tag"]] for entry in cards
        )

def export_to_apkg(cards, deck_name="BatAnkiDeck"):
    """Package *cards* into an Anki ``.apkg`` file named after the deck.

    Args:
        cards: Iterable of dicts providing "question" and "answer" entries.
        deck_name: Name of the deck; also used as the output file stem.

    Returns:
        The path of the written package, ``f"{deck_name}.apkg"``.
    """
    # Random deck id in [2**30, 2**31). The previous 64-bit value could exceed
    # the signed 64-bit range that SQLite integers are limited to.
    deck_id = (uuid.uuid4().int >> 98) + (1 << 30)

    model = genanki.Model(
        1607392319,
        "BatAnkiModel",
        fields=[{"name": "Question"}, {"name": "Answer"}],
        templates=[{
            "name": "Card 1",
            "qfmt": "{{Question}}",
            "afmt": "{{FrontSide}}<hr id='answer'>{{Answer}}",
        }]
    )
    deck = genanki.Deck(deck_id, deck_name)
    for card in cards:
        deck.add_note(
            genanki.Note(model=model, fields=[card["question"], card["answer"]])
        )

    output_path = f"{deck_name}.apkg"
    genanki.Package(deck).write_to_file(output_path)
    return output_path