"""Streamlit front-end for the Magi manga-analysis model (CPU-only inference)."""

import urllib.request

import numpy as np
import streamlit as st
import torch
from PIL import Image
from transformers import AutoModel

# ===============================
# GLOBAL CONFIGURATION (CPU ONLY)
# ===============================
# Cap CPU threads and disable autograd process-wide: this app only runs inference.
torch.set_num_threads(2)
torch.set_grad_enabled(False)

st.set_page_config(
    page_title="Manga Whisperer",
    layout="wide"
)


# ===============================
# LOAD MODEL (CACHED)
# ===============================
@st.cache_resource
def load_model():
    """Load the Magi model once per server process and put it in eval mode.

    ``st.cache_resource`` ensures the (expensive) download/initialisation
    happens a single time and the same model object is shared across reruns.
    """
    model = AutoModel.from_pretrained(
        "ragavsachdeva/magi",
        trust_remote_code=True
    )
    model.eval()
    return model


model = load_model()


# ===============================
# IMAGE LOADER
# ===============================
@st.cache_data(show_spinner=False)
def read_image_as_np_array(image_input):
    """Load an image (URL string or file-like object) as an RGB numpy array.

    The image is converted to grayscale ("L") and then back to "RGB",
    producing a 3-channel grayscale image in the layout the model expects.
    """
    if isinstance(image_input, str) and image_input.startswith("http"):
        # Context manager so the HTTP connection is closed even on error
        # (the original left the urlopen response dangling).
        with urllib.request.urlopen(image_input) as response:
            # .convert() forces a full decode while the stream is still open.
            image = Image.open(response).convert("L").convert("RGB")
    else:
        image = Image.open(image_input).convert("L").convert("RGB")
    return np.array(image)


# ===============================
# DETECTION STAGE
# ===============================
@st.cache_data(show_spinner=True)
def run_detection(image_input, params):
    """Run detection + character/text association on a single image.

    Parameters
    ----------
    image_input : str or file-like
        Forwarded to ``read_image_as_np_array``.
    params : dict
        Keyword arguments passed through to the model's
        ``predict_detections_and_associations``.

    Returns
    -------
    tuple
        ``(image_array, detection_result)`` for the single input image.
    """
    image = read_image_as_np_array(image_input)
    with torch.inference_mode():
        result = model.predict_detections_and_associations(
            [image],
            **params
        )[0]
    return image, result


# ===============================
# OCR STAGE
# ===============================
@st.cache_data(show_spinner=True)
def run_ocr(image, detection_result):
    """OCR the text regions found by detection.

    Returns ``None`` when detection produced no text boxes, otherwise the
    OCR result for this single image.
    """
    if not detection_result["texts"]:
        return None
    # predict_ocr is batched; wrap this image's boxes in a one-element batch.
    text_bboxes = [detection_result["texts"]]
    with torch.inference_mode():
        ocr_results = model.predict_ocr(
            [image],
            text_bboxes
        )
    return ocr_results[0]


# ===============================
# TRANSCRIPT STAGE
# ===============================
def generate_transcript(detection_result, ocr_result):
    """Build a readable transcript; fall back to a notice when OCR found nothing."""
    if ocr_result is None:
        return "Nenhum texto detectado."
return model.generate_transcript_for_single_image( detection_result, ocr_result ) def generate_structural_dialogue(detection_result, threshold=0.4): texts = detection_result.get("texts", []) characters = detection_result.get("characters", []) scores = detection_result.get( "text_character_matching_scores", [] ) dialogue_lines = [] for text_id in range(len(texts)): if text_id < len(scores) and scores[text_id]: char_scores = scores[text_id] best_char = max( range(len(char_scores)), key=lambda i: char_scores[i] ) best_score = char_scores[best_char] if best_score >= threshold: line = ( f"Text {text_id} → " f"Character {best_char} " f"(score: {best_score:.2f})" ) else: line = f"Text {text_id} → Narration / Uncertain" else: line = f"Text {text_id} → Narration / Uncertain" dialogue_lines.append(line) return "\n".join(dialogue_lines) # =============================== # UI # =============================== st.markdown( """