# magiv1
#
# Sleeping
#
# File size: 5,729 Bytes

import streamlit as st
import torch
import numpy as np
import urllib.request
from PIL import Image
from transformers import AutoModel

# ===============================
# GLOBAL SETTINGS (CPU ONLY)
# ===============================
# Cap PyTorch's CPU thread pool at two threads.
torch.set_num_threads(2)
# Turn gradient tracking off globally; this script only runs inference.
torch.set_grad_enabled(False)

# Page title and wide layout for the Streamlit app.
st.set_page_config(
    page_title="Manga Whisperer",
    layout="wide"
)

# ===============================
# LOAD MODEL (CACHED)
# ===============================
@st.cache_resource
def load_model():
    """Load the MAGI model and return it in evaluation mode.

    The function is wrapped in ``st.cache_resource``, so the returned model
    object is cached by Streamlit rather than rebuilt on every script rerun.

    Returns:
        The model produced by ``AutoModel.from_pretrained`` for the
        ``ragavsachdeva/magi`` checkpoint, after ``eval()`` has been called.
    """
    # NOTE: trust_remote_code=True lets the checkpoint repository supply the
    # model's Python implementation.
    magi = AutoModel.from_pretrained("ragavsachdeva/magi", trust_remote_code=True)
    magi.eval()
    return magi


# Module-level handle used by every pipeline stage below.
model = load_model()

# ===============================
# IMAGE LOADER
# ===============================
@st.cache_data(show_spinner=False)
def read_image_as_np_array(image_input):
    """Load an image and return it as an RGB NumPy array.

    The image is converted to grayscale ("L") and then back to "RGB", so the
    colour information of the source is discarded while the array keeps three
    channels.

    Args:
        image_input: Either an ``http://`` / ``https://`` URL string, or any
            value accepted by ``PIL.Image.open`` (for example a file path or
            a binary file-like object such as an uploaded file).

    Returns:
        A ``numpy.ndarray`` built from the converted image.
    """
    is_url = isinstance(image_input, str) and image_input.startswith(
        ("http://", "https://")
    )

    if is_url:
        # Close the HTTP response deterministically. The conversion happens
        # inside the ``with`` block because PIL reads pixel data lazily and
        # needs the stream until the image has been decoded.
        with urllib.request.urlopen(image_input) as response:
            image = Image.open(response).convert("L").convert("RGB")
    else:
        image = Image.open(image_input).convert("L").convert("RGB")

    return np.array(image)

# ===============================
# DETECTION STAGE
# ===============================
@st.cache_data(show_spinner=True)
def run_detection(image_input, params):
    """Run the model's detection/association stage on a single image.

    Args:
        image_input: Image source forwarded to ``read_image_as_np_array``.
        params: Mapping of keyword arguments passed through to
            ``model.predict_detections_and_associations``.

    Returns:
        A ``(image, result)`` tuple: the loaded image array and the first
        (only) entry of the model's batched output.
    """
    page = read_image_as_np_array(image_input)

    with torch.inference_mode():
        # The model takes a batch; wrap the single page in a list and unwrap
        # the matching single result.
        batch_output = model.predict_detections_and_associations([page], **params)
        page_result = batch_output[0]

    return page, page_result

# ===============================
# OCR STAGE
# ===============================
@st.cache_data(show_spinner=True)
def run_ocr(image, detection_result):
    """Run the model's OCR stage over the detected text boxes of one image.

    Args:
        image: Image array handed to ``model.predict_ocr``.
        detection_result: Detection output; its ``"texts"`` entry supplies
            the text boxes to read.

    Returns:
        The first (only) entry of the model's batched OCR output, or ``None``
        when ``detection_result["texts"]`` is empty.
    """
    text_boxes = detection_result["texts"]
    if text_boxes:
        with torch.inference_mode():
            # Both arguments are batches of size one.
            per_image = model.predict_ocr([image], [text_boxes])
        return per_image[0]

    # Nothing to read on this page.
    return None

# ===============================
# TRANSCRIPT STAGE
# ===============================
def generate_transcript(detection_result, ocr_result):
    """Build the transcript text for one image.

    Args:
        detection_result: Detection output for the image.
        ocr_result: OCR output for the image, or ``None`` when no text was
            found.

    Returns:
        The value returned by ``model.generate_transcript_for_single_image``,
        or a fixed placeholder message when ``ocr_result`` is ``None``.
    """
    if ocr_result is not None:
        return model.generate_transcript_for_single_image(detection_result, ocr_result)

    return "Nenhum texto detectado."

def generate_structural_dialogue(detection_result, threshold=0.4):
    """Describe, one line per text box, which character each text is tied to.

    For every index in ``detection_result["texts"]`` the matching row of
    ``detection_result["text_character_matching_scores"]`` is inspected and
    the highest-scoring character index is picked (the first one on ties).
    The text is attributed to that character when the score reaches
    ``threshold``; otherwise, or when no scores exist for the text, the line
    reads "Narration / Uncertain".

    Args:
        detection_result: Mapping holding ``"texts"`` and, optionally,
            ``"text_character_matching_scores"`` (one row of per-character
            scores per text). Rows may be lists or array-like objects.
        threshold: Minimum score required to attribute a text to a character.

    Returns:
        The lines joined with newlines; an empty string when there are no
        texts.
    """
    texts = detection_result.get("texts")
    scores = detection_result.get("text_character_matching_scores")
    text_count = 0 if texts is None else len(texts)
    score_rows = 0 if scores is None else len(scores)

    dialogue_lines = []
    for text_id in range(text_count):
        row = scores[text_id] if text_id < score_rows else None

        # Use an explicit length check rather than truthiness: array-like
        # rows (e.g. NumPy) raise on bool() when they hold several elements.
        if row is None or len(row) == 0:
            dialogue_lines.append(f"Text {text_id} → Narration / Uncertain")
            continue

        best_char = max(range(len(row)), key=lambda i: row[i])
        best_score = row[best_char]

        if best_score >= threshold:
            dialogue_lines.append(
                f"Text {text_id} → Character {best_char} (score: {best_score:.2f})"
            )
        else:
            dialogue_lines.append(f"Text {text_id} → Narration / Uncertain")

    return "\n".join(dialogue_lines)



# ===============================
# UI
# ===============================
# Page header: inline CSS for the title/subtitle plus the two headline
# elements. unsafe_allow_html=True is required for Streamlit to render the
# raw <style>/<div> markup instead of escaping it.
st.markdown(
    """
    <style>
    .title {
        font-size: 2.2em;
        text-align: center;
        color: #ffffff;
        font-family: 'Comic Sans MS', cursive;
        margin-bottom: 0.2em;
    }
    .subtitle {
        font-size: 1.2em;
        text-align: center;
        color: #cccccc;
        margin-bottom: 1em;
    }
    </style>
    <div class="title">Manga Whisperer</div>
    <div class="subtitle">Automatic Comic Transcription (CPU Optimized)</div>
    """,
    unsafe_allow_html=True
)

# ===============================
# SIDEBAR
# ===============================
# Mode toggles: choose which outputs the main pipeline renders.
st.sidebar.markdown("### Mode")
# On by default: show the detection visualisation.
generate_detections = st.sidebar.toggle(
    "Generate detections", True
)
# Off by default: in the main pipeline this gates the structural-dialogue
# text area.
generate_transcript_toggle = st.sidebar.toggle(
    "Generate transcript (slow)", False
)

# Threshold sliders, each ranging 0.0-1.0 in steps of 0.01. The resulting
# dict is passed to run_detection, which forwards it as keyword arguments to
# the model's detection call.
st.sidebar.markdown("### Thresholds")
params = dict(
    character_detection_threshold=st.sidebar.slider(
        "Character detection", 0.0, 1.0, 0.30, 0.01
    ),
    panel_detection_threshold=st.sidebar.slider(
        "Panel detection", 0.0, 1.0, 0.20, 0.01
    ),
    text_detection_threshold=st.sidebar.slider(
        "Text detection", 0.0, 1.0, 0.25, 0.01
    ),
    character_character_matching_threshold=st.sidebar.slider(
        "Character-character matching", 0.0, 1.0, 0.70, 0.01
    ),
    text_character_matching_threshold=st.sidebar.slider(
        "Text-character matching", 0.0, 1.0, 0.40, 0.01
    ),
)

# ===============================
# INPUT IMAGE
# ===============================
# Upload widget restricted to PNG/JPEG files; the main pipeline below only
# runs when this is not None.
image_input = st.file_uploader(
    "Upload an image",
    type=["png", "jpg", "jpeg"]
)

# ===============================
# MAIN PIPELINE
# ===============================
# Runs only once an image has been uploaded.
if image_input is not None:

    st.markdown("### Prediction")

    # 1) Detection: load the image and run the model with the sidebar
    #    thresholds.
    image, detection_result = run_detection(
        image_input,
        params
    )

    # 2) Visualization: draw the detections on the page when enabled.
    if generate_detections:
        vis = model.visualise_single_image_prediction(
            image,
            detection_result
        )
        st.image(vis, caption="Detections")

    # 3) Structural dialogue (no OCR): summarise text-to-character matches.
    if generate_transcript_toggle:
        # Use the sidebar's text-character matching threshold so the summary
        # follows the slider instead of the function's hard-coded default.
        structural_dialogue = generate_structural_dialogue(
            detection_result,
            threshold=params["text_character_matching_threshold"]
        )
        st.text_area(
            "Structural Dialogue (MAGI output)",
            structural_dialogue,
            height=300
        )