# ==============================================================================
# 1) PACKAGE INSTALLATION
# ==============================================================================
import subprocess
import sys

# Install with the interpreter that runs this code so the packages land in the
# active environment. This replaces the IPython-only "!pip" shell escape, which
# is a syntax error outside a notebook. The install has to run BEFORE the
# third-party imports below, otherwise a fresh environment fails on import.
_pip_result = subprocess.run(
    [
        sys.executable, "-m", "pip", "-q", "install", "-U",
        "transformers", "huggingface_hub", "accelerate", "timm",
        "sentencepiece", "safetensors", "pillow", "einops",
        "pytorch_metric_learning",
    ],
    capture_output=True,
    text=True,
    check=False,  # like the shell escape: report the failure, do not abort
)
if _pip_result.returncode != 0:
    print(_pip_result.stderr, file=sys.stderr)

# ==============================================================================
# 2) IMPORTS
# ==============================================================================
import json  # noqa: E402
import math  # noqa: E402
import re  # noqa: E402,F401  (used by the optional downloader below)
from collections import defaultdict  # noqa: E402
from io import BytesIO  # noqa: E402,F401  (used by the optional downloader below)

import requests  # noqa: E402,F401  (used by the optional downloader below)
import torch  # noqa: E402
from IPython.display import display  # noqa: E402
from PIL import Image, ImageDraw  # noqa: E402
from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor  # noqa: E402

# ==============================================================================
# 3) IMAGE ACQUISITION
# ==============================================================================
# Optional alternative: fetch the image from an imgbb page instead of reading a
# local file. Kept disabled, exactly as in the original script.
#
# def download_imgbb_image(page_url):
#     print(f"Pobieranie obrazu ze strony: {page_url}")
#     html = requests.get(page_url).text
#     img_url = re.search(r'https://i\.ibb\.co/[A-Za-z0-9/_\-]+\.(?:png|jpg|jpeg|webp)', html).group(0)
#     print(f"Znaleziono bezpośredni link: {img_url}")
#     img_bytes = requests.get(img_url).content
#     return Image.open(BytesIO(img_bytes)).convert("RGB")
#
# page_url = "https://ibb.co/cchLK038"
# pil_img = download_imgbb_image(page_url)
# print("Obraz został pomyślnie pobrany.")

pil_img = Image.open("./1.png").convert("RGB")

# ==============================================================================
# 4) MODEL AND PROCESSOR LOADING
# ==============================================================================
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision only on GPU; CPU runs in float32.
dtype = torch.float16 if device == "cuda" else torch.float32
print(f"\nUżywane urządzenie: {device}, typ danych: {dtype}")

model_id = "MattyMroz/magiv3"
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
# Force the eager attention implementation for this remote-code model.
config._attn_implementation = "eager"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=config,
    trust_remote_code=True,
    torch_dtype=dtype,
).to(device).eval()

# Give the model an explicit `_supports_sdpa` flag when it does not define one.
if not hasattr(model, "_supports_sdpa"):
    model._supports_sdpa = False

print("Model i procesor załadowane pomyślnie.")

# ==============================================================================
# 5) VISUALIZATION AND PROCESSING
# ==============================================================================
# Outline colour per detection category plus colours for the association lines.
_COLORS = {
    "panels": "green",
    "texts": "red",
    "characters": "blue",
    "tails": "purple",
    "cluster_colors": ["#f50a8f", "#4b13b6", "#ddaa34", "#b7ff51", "#bea2a2"],
    "speaker_line": "magenta",
    "ocr": "orange",
    "grounding": "cyan",
}
# Outline width (in pixels) per box category.
_LINE_WIDTHS = {
    "panels": 2,
    "texts": 1,
    "characters": 2,
    "tails": 1,
    "ocr": 2,
    "grounding": 2,
}


def _box_center(box):
    """Return the (x, y) centre of an ``(x1, y1, x2, y2)`` box."""
    x1, y1, x2, y2 = box
    return (x1 + x2) / 2, (y1 + y2) / 2


def _dash_segments(p1, p2, dash_len=10):
    """Yield ``(start, end)`` point pairs for the dashes of a line p1 -> p2.

    Dashes and gaps are both ``dash_len`` long. The last dash is clamped to
    the end point, so a line shorter than one dash still produces a segment
    (the previous implementation drew nothing in that case and always lost
    the trailing partial dash).
    """
    x1, y1 = p1
    x2, y2 = p2
    dx, dy = x2 - x1, y2 - y1
    dist = math.hypot(dx, dy)
    if dist == 0 or dash_len <= 0:
        return
    ux, uy = dx / dist, dy / dist
    offset = 0.0
    while offset < dist:
        stop = min(offset + dash_len, dist)
        yield (
            (x1 + ux * offset, y1 + uy * offset),
            (x1 + ux * stop, y1 + uy * stop),
        )
        offset += 2 * dash_len  # skip the gap that follows every dash


def _draw_dashed_line(draw_obj, p1, p2, fill, width, dash_len=10):
    """Draw a dashed line from ``p1`` to ``p2`` on an ``ImageDraw`` object."""
    for start, end in _dash_segments(p1, p2, dash_len):
        draw_obj.line([start, end], fill=fill, width=width)


def create_visualization(image, data, detailed_mode=False):
    """Draw detections and associations from ``data`` on a copy of ``image``.

    Args:
        image: Input PIL image; it is copied, not modified.
        data: Result dictionary as built by ``process_image`` (keys
            ``"detections"``, ``"associations"``, ``"ocr"``, ``"grounding"``).
            Missing keys are treated as empty.
        detailed_mode: If True, additionally draw the OCR and grounding
            boxes. If False (default), draw only detections and associations.

    Returns:
        A new image with the annotations drawn on it.
    """
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)

    detections = data.get("detections", {})
    associations = data.get("associations", {})

    # Bounding boxes of every known detection category.
    for category, bboxes in detections.items():
        if category not in _COLORS:
            continue
        for box in bboxes:
            draw.rectangle(
                box,
                outline=_COLORS[category],
                width=_LINE_WIDTHS.get(category, 1),
            )

    # Character clusters: chain the members of each cluster with a line.
    clusters = associations.get("character_cluster_labels", [])
    characters = detections.get("characters", [])
    if clusters and characters:
        members = defaultdict(list)
        for idx, label in enumerate(clusters):
            # Ignore labels that have no matching character box.
            if idx < len(characters):
                members[label].append(idx)
        palette = _COLORS["cluster_colors"]
        for i, label in enumerate(sorted(set(clusters))):
            color = palette[i % len(palette)]
            centers = [_box_center(characters[j]) for j in members[label]]
            for p1, p2 in zip(centers, centers[1:]):
                draw.line([p1, p2], fill=color, width=2)

    # Speaker links: dashed line from each text box to its character box.
    texts = detections.get("texts", [])
    speaker_associations = associations.get("text_character_associations", [])
    if speaker_associations and texts and characters:
        for text_idx, char_idx in speaker_associations:
            # Reject negative indices too; they would silently wrap around.
            if 0 <= text_idx < len(texts) and 0 <= char_idx < len(characters):
                _draw_dashed_line(
                    draw,
                    _box_center(texts[text_idx]),
                    _box_center(characters[char_idx]),
                    fill=_COLORS["speaker_line"],
                    width=1,
                )

    # Detailed mode: also draw the extra elements present in the JSON.
    if detailed_mode:
        for ocr_item in data.get("ocr", []):
            box = ocr_item.get("box")
            if box:
                draw.rectangle(
                    box, outline=_COLORS["ocr"], width=_LINE_WIDTHS["ocr"])

        for grounding_item in data.get("grounding", []):
            for box in grounding_item.get("boxes", []):
                draw.rectangle(
                    box,
                    outline=_COLORS["grounding"],
                    width=_LINE_WIDTHS["grounding"],
                )

    return img_draw


def process_image(image, caption_for_grounding="elf girl", detailed_mode=False):
    """Run OCR, detection/association and grounding, then visualize the result.

    Uses the module-level ``model`` and ``processor``.

    Args:
        image: Input PIL image.
        caption_for_grounding: Caption passed to character grounding.
        detailed_mode: Forwarded to ``create_visualization``; if True the
            visualization also contains the OCR and grounding boxes.

    Returns:
        Tuple ``(final_json, visualization_image)``.
    """
    print("\n--- Rozpoczynanie przetwarzania obrazu ---")
    images = [image]
    captions = [caption_for_grounding]

    # Pure inference: no autograd bookkeeping is needed.
    with torch.no_grad():
        print("1/3: Uruchamianie OCR...")
        ocr_results = model.predict_ocr(images, processor)[0]

        print("2/3: Uruchamianie detekcji i asocjacji...")
        detection_results = model.predict_detections_and_associations(
            images, processor)[0]

        print("3/3: Uruchamianie 'Character Grounding'...")
        grounding_results = model.predict_character_grounding(
            images, captions, processor)[0]

    grounded_caption = grounding_results.get("grounded_caption", "")
    final_json = {
        "ocr": [
            {"text": text, "box": box}
            for text, box in zip(
                ocr_results.get("ocr_texts", []),
                ocr_results.get("bboxes", []),
            )
        ],
        "detections": {
            "panels": detection_results.get("panels", []),
            "texts": detection_results.get("texts", []),
            "characters": detection_results.get("characters", []),
            "tails": detection_results.get("tails", []),
        },
        "associations": {
            "character_cluster_labels": detection_results.get(
                "character_cluster_labels", []),
            "text_character_associations": detection_results.get(
                "text_character_associations", []),
            "text_tail_associations": detection_results.get(
                "text_tail_associations", []),
            "is_essential_text": detection_results.get(
                "is_essential_text", []),
        },
        # Each grounded phrase is the caption slice given by its index pair.
        "grounding": [
            {"phrase": grounded_caption[start:end], "boxes": boxes}
            for boxes, (start, end) in zip(
                grounding_results.get("bboxes", []),
                grounding_results.get("indices_of_bboxes_in_caption", []),
            )
        ],
    }

    mode_text = "wybredny (wszystkie elementy)" if detailed_mode else "domyślny (detekcje i asocjacje)"
    print(f"Tworzenie wizualizacji w trybie: {mode_text}")
    visualization_image = create_visualization(
        image, final_json, detailed_mode=detailed_mode)

    print("--- Zakończono przetwarzanie ---")
    return final_json, visualization_image


# ==============================================================================
# 6) RUN AND DISPLAY THE RESULTS
# ==============================================================================
# Visualization mode:
#   detailed_mode=False (default)  - draws only detections and associations
#   detailed_mode=True  (detailed) - draws everything from the JSON:
#                                    OCR (orange), grounding (cyan)
json_output, image_output = process_image(
    pil_img, caption_for_grounding="elf girl", detailed_mode=True)

print("\n\n===== WYNIKI W FORMACIE JSON (przed filtrowaniem) =====")
# ensure_ascii=False keeps non-ASCII OCR text readable instead of \uXXXX escapes.
print(json.dumps(json_output, indent=2, ensure_ascii=False))

print("\n\n===== WIZUALIZACJA (przed filtrowaniem) =====")
display(image_output)