# magiv3
#
# File size: 9,426 Bytes

# ==============================================================================
# 1) INSTALACJA PAKIETÓW
# ==============================================================================
from transformers import AutoProcessor, AutoModelForCausalLM, AutoConfig
from IPython.display import display
from io import BytesIO
from PIL import Image, ImageDraw
import math
import json
import torch
import requests
import re
!pip -q install -U "transformers" "huggingface_hub" "accelerate" "timm" "sentencepiece" "safensors" "pillow" "einops" "pytorch_metric_learning"

# ==============================================================================
# 2) IMPORTY
# ==============================================================================

# ==============================================================================
# 3) IMAGE LOADING
# ==============================================================================
# Disabled alternative source for the image: the commented-out helper below
# downloads an imgbb page, searches its HTML for a direct https://i.ibb.co/...
# image link (png/jpg/jpeg/webp) and opens the downloaded bytes as an RGB image.
# It is kept for reference only; the local file further down is what is used.
# NOTE(review): re.search(...) would return None if no link matched, so
# .group(0) would raise if this helper were re-enabled as is.
# def download_imgbb_image(page_url):
#     print(f"Pobieranie obrazu ze strony: {page_url}")
#     html = requests.get(page_url).text
#     img_url = re.search(r'https://i\.ibb\.co/[A-Za-z0-9/_\-]+\.(?:png|jpg|jpeg|webp)', html).group(0)
#     print(f"Znaleziono bezpośredni link: {img_url}")
#     img_bytes = requests.get(img_url).content
#     return Image.open(BytesIO(img_bytes)).convert("RGB")

# page_url = "https://ibb.co/cchLK038"
# pil_img = download_imgbb_image(page_url)
# print("Obraz został pomyślnie pobrany.")
# Active path: open ./1.png (relative to the working directory) and convert it
# to RGB mode. pil_img is the image passed to process_image() later on.
pil_img = Image.open("./1.png").convert("RGB")

# ==============================================================================
# 4) MODEL AND PROCESSOR LOADING
# ==============================================================================
# Use the GPU when torch reports one; half precision is selected only on CUDA,
# the CPU path stays in float32.
if torch.cuda.is_available():
    device = "cuda"
    dtype = torch.float16
else:
    device = "cpu"
    dtype = torch.float32
print(f"\nUżywane urządzenie: {device}, typ danych: {dtype}")

model_id = "MattyMroz/magiv3"

# trust_remote_code=True is passed to every from_pretrained call for this
# repository (processor, config and model).
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
# Request the "eager" attention implementation on the config.
# NOTE(review): presumably needed because the model code does not support
# SDPA (see the _supports_sdpa fallback below) - confirm.
config._attn_implementation = "eager"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=config,
    trust_remote_code=True,
    torch_dtype=dtype,
)
model = model.to(device)
model.eval()

# Only define the flag when the loaded model class does not provide it.
if not hasattr(model, "_supports_sdpa"):
    model._supports_sdpa = False

print("Model i procesor załadowane pomyślnie.")

# ==============================================================================
# 5) ZAAWANSOWANA WIZUALIZACJA I PRZETWARZANIE
# ==============================================================================


def create_visualization(image, data, detailed_mode=False):
    """
    Draw detections and associations from ``data`` onto a copy of ``image``.

    Args:
        image: Input image. It is copied with ``image.copy()``; the original
            object is not drawn on.
        data: Result dictionary. Keys read here (all optional, missing keys
            are treated as empty):
            ``detections`` - mapping of category name to a list of boxes;
            ``associations`` - ``character_cluster_labels`` (one label per
            character box) and ``text_character_associations`` (pairs of
            ``(text_index, character_index)``);
            ``ocr`` - items with a ``box`` entry (detailed mode only);
            ``grounding`` - items with a ``boxes`` list (detailed mode only).
            Boxes are unpacked as ``(x1, y1, x2, y2)``.
        detailed_mode: If True, OCR and grounding boxes are drawn as well.
            If False (default), only detections and associations are drawn.

    Returns:
        The annotated copy of the image.
    """
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)

    # Colour palette and outline widths per drawn element.
    colors = {
        "panels": "green",
        "texts": "red",
        "characters": "blue",
        "tails": "purple",
        "cluster_colors": ["#f50a8f", "#4b13b6", "#ddaa34", "#b7ff51", "#bea2a2"],
        "speaker_line": "magenta",
        "ocr": "orange",
        "grounding": "cyan",
    }
    line_widths = {"panels": 2, "texts": 1, "characters": 2, "tails": 1, "ocr": 2, "grounding": 2}

    def get_box_center(box):
        """Return the (x, y) midpoint of an (x1, y1, x2, y2) box."""
        x1, y1, x2, y2 = box
        return (x1 + x2) / 2, (y1 + y2) / 2

    def draw_dashed_line(draw_obj, p1, p2, fill, width, dash_len=10):
        """Draw a dashed segment from p1 to p2 (dash and gap both dash_len long)."""
        x1, y1 = p1
        x2, y2 = p2
        dx, dy = x2 - x1, y2 - y1
        dist = math.sqrt(dx**2 + dy**2)
        if dist == 0 or dash_len <= 0:
            return
        ux, uy = dx / dist, dy / dist
        offset = 0.0
        while offset < dist:
            # Clamp the dash to the end point so that segments shorter than
            # dash_len, and the trailing partial dash, are still drawn.
            dash_end = min(offset + dash_len, dist)
            start = (x1 + ux * offset, y1 + uy * offset)
            end = (x1 + ux * dash_end, y1 + uy * dash_end)
            draw_obj.line([start, end], fill=fill, width=width)
            offset += 2 * dash_len

    detections = data.get("detections", {})
    associations = data.get("associations", {})

    # Bounding boxes for every detection category that has a colour.
    for category, bboxes in detections.items():
        if category not in colors:
            continue
        for box in bboxes:
            draw.rectangle(
                box, outline=colors[category], width=line_widths.get(category, 1))

    # Character clusters: connect consecutive boxes that share a label.
    clusters = associations.get("character_cluster_labels", [])
    characters = detections.get("characters", [])
    if clusters and characters:
        # zip() stops at the shorter list, so a label without a matching
        # character box can no longer cause an IndexError.
        boxes_by_label = {}
        for label, box in zip(clusters, characters):
            boxes_by_label.setdefault(label, []).append(box)
        palette = colors["cluster_colors"]
        for rank, label in enumerate(sorted(boxes_by_label)):
            color = palette[rank % len(palette)]
            centers = [get_box_center(box) for box in boxes_by_label[label]]
            for p1, p2 in zip(centers, centers[1:]):
                draw.line([p1, p2], fill=color, width=2)

    # Speaker lines: dashed line from each text box to its character box.
    texts = detections.get("texts", [])
    speaker_associations = associations.get("text_character_associations", [])
    if speaker_associations and texts and characters:
        for text_idx, char_idx in speaker_associations:
            # Reject negative indices too: they would silently wrap around
            # and connect the wrong boxes.
            if 0 <= text_idx < len(texts) and 0 <= char_idx < len(characters):
                p1 = get_box_center(texts[text_idx])
                p2 = get_box_center(characters[char_idx])
                draw_dashed_line(
                    draw, p1, p2, fill=colors["speaker_line"], width=1)

    # Detailed mode: additionally draw the OCR and grounding boxes.
    if detailed_mode:
        for ocr_item in data.get("ocr", []):
            box = ocr_item.get("box")
            if box:
                draw.rectangle(box, outline=colors["ocr"], width=line_widths["ocr"])

        for grounding_item in data.get("grounding", []):
            for box in grounding_item.get("boxes", []):
                draw.rectangle(box, outline=colors["grounding"], width=line_widths["grounding"])

    return img_draw


def process_image(image, caption_for_grounding="elf girl", detailed_mode=False):
    """
    Run the three model predictions on one image and build its visualization.

    The module-level ``model`` and ``processor`` are used for OCR, for
    detections/associations and for character grounding (in that order).
    Each call receives a one-element batch and only the first result is kept.

    Args:
        image: Input image.
        caption_for_grounding: Caption passed to character grounding.
        detailed_mode: Forwarded to ``create_visualization``. If True, the
            visualization also contains OCR and grounding boxes; if False
            (default), only detections and associations.

    Returns:
        A tuple ``(final_json, visualization_image)`` where ``final_json`` has
        the keys ``ocr``, ``detections``, ``associations`` and ``grounding``.
    """
    print("\n--- Rozpoczynanie przetwarzania obrazu ---")
    batch = [image]
    caption_batch = [caption_for_grounding]

    print("1/3: Uruchamianie OCR...")
    ocr_out = model.predict_ocr(batch, processor)[0]

    print("2/3: Uruchamianie detekcji i asocjacji...")
    det_out = model.predict_detections_and_associations(batch, processor)[0]

    print("3/3: Uruchamianie 'Character Grounding'...")
    ground_out = model.predict_character_grounding(
        batch, caption_batch, processor)[0]

    # OCR: pair every recognized text with its box.
    ocr_entries = []
    for text, box in zip(ocr_out.get("ocr_texts", []), ocr_out.get("bboxes", [])):
        ocr_entries.append({"text": text, "box": box})

    # Detections and associations: copy the listed keys, defaulting to [].
    detection_keys = ("panels", "texts", "characters", "tails")
    association_keys = (
        "character_cluster_labels",
        "text_character_associations",
        "text_tail_associations",
        "is_essential_text",
    )
    detections = {key: det_out.get(key, []) for key in detection_keys}
    associations = {key: det_out.get(key, []) for key in association_keys}

    # Grounding: slice the phrase out of the grounded caption for each box set.
    grounded_caption = ground_out.get("grounded_caption", "")
    grounding_entries = []
    for boxes, (start, end) in zip(
            ground_out.get("bboxes", []),
            ground_out.get("indices_of_bboxes_in_caption", [])):
        grounding_entries.append(
            {"phrase": grounded_caption[start:end], "boxes": boxes})

    final_json = {
        "ocr": ocr_entries,
        "detections": detections,
        "associations": associations,
        "grounding": grounding_entries,
    }

    if detailed_mode:
        mode_text = "wybredny (wszystkie elementy)"
    else:
        mode_text = "domyślny (detekcje i asocjacje)"
    print(f"Tworzenie wizualizacji w trybie: {mode_text}")
    visualization_image = create_visualization(
        image, final_json, detailed_mode=detailed_mode)

    print("--- Zakończono przetwarzanie ---")
    return final_json, visualization_image

# ==============================================================================
# 6) RUN AND DISPLAY RESULTS
# ==============================================================================

# Visualization mode:
# detailed_mode=False (default)  - draws only detections and associations
# detailed_mode=True  (detailed) - additionally draws everything else from the
#                                  JSON: OCR boxes (orange), grounding boxes (cyan)
json_output, image_output = process_image(
    pil_img, caption_for_grounding="elf girl", detailed_mode=True)

print("\n\n===== WYNIKI W FORMACIE JSON (przed filtrowaniem) =====")
# ensure_ascii=False keeps non-ASCII OCR text readable instead of printing
# it as \uXXXX escape sequences.
print(json.dumps(json_output, indent=2, ensure_ascii=False))

print("\n\n===== WIZUALIZACJA (przed filtrowaniem) =====")
display(image_output)