"""Build the PortraitCraft Track 2 solution-description PDF with ReportLab.

Running this module writes ``solution_description.pdf`` next to the script and
prints the output path. Example images are read from the ``assets`` directory
beside the script.
"""

from __future__ import annotations

from pathlib import Path

from PIL import Image as PILImage
from reportlab.lib import colors
from reportlab.lib.enums import TA_LEFT
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.units import cm
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import (
    Image,
    PageBreak,
    Paragraph,
    SimpleDocTemplate,
    Spacer,
    Table,
    TableStyle,
)

ROOT = Path(__file__).resolve().parent
OUT = ROOT / "solution_description.pdf"
ASSET = ROOT / "assets"

# Single source for the title used on page 1, in the footer and in PDF metadata.
DOC_TITLE = "PortraitCraft Track 2 Solution Description"

try:
    pdfmetrics.registerFont(TTFont("HelveticaNeue", "/System/Library/Fonts/Helvetica.ttc"))
except Exception:
    # Deliberately best-effort: the font file lives at a system-specific path
    # and may be absent. No style below references "HelveticaNeue" (they all
    # use the built-in Helvetica faces), so a failed registration is harmless.
    pass

PAGE_W, PAGE_H = A4
MARGIN = 1.55 * cm

ACCENT = colors.HexColor("#A54F2B")
INK = colors.HexColor("#211C17")
MUTED = colors.HexColor("#6F665B")
LINE = colors.HexColor("#D8CAB8")
PAPER = colors.HexColor("#FFF9EF")

styles = getSampleStyleSheet()
styles.add(ParagraphStyle(
    name="TitleBig",
    parent=styles["Title"],
    fontName="Helvetica-Bold",
    fontSize=24,
    leading=28,
    textColor=INK,
    spaceAfter=8,
    alignment=TA_LEFT,
))
styles.add(ParagraphStyle(
    name="Sub",
    parent=styles["BodyText"],
    fontName="Helvetica",
    fontSize=10.5,
    leading=14,
    textColor=MUTED,
    spaceAfter=14,
))
styles.add(ParagraphStyle(
    name="H",
    parent=styles["Heading2"],
    fontName="Helvetica-Bold",
    fontSize=14.5,
    leading=18,
    textColor=ACCENT,
    spaceBefore=10,
    spaceAfter=7,
))
styles.add(ParagraphStyle(
    name="Body",
    parent=styles["BodyText"],
    fontName="Helvetica",
    fontSize=9.7,
    leading=13.2,
    textColor=INK,
    spaceAfter=7,
))
styles.add(ParagraphStyle(
    name="Small",
    parent=styles["BodyText"],
    fontName="Helvetica",
    fontSize=8.2,
    leading=10.5,
    textColor=MUTED,
    spaceAfter=4,
))
styles.add(ParagraphStyle(
    name="MyBullet",
    parent=styles["BodyText"],
    fontName="Helvetica",
    fontSize=9.4,
    leading=12.5,
    textColor=INK,
    leftIndent=12,
    bulletIndent=3,
    spaceAfter=4,
))


def p(text: str, style: str = "Body") -> Paragraph:
    """Return a ``Paragraph`` of *text* using the named entry of ``styles``."""
    return Paragraph(text, styles[style])


def bullet(text: str) -> Paragraph:
    """Return a ``MyBullet``-styled paragraph prefixed with a "-" bullet."""
    return Paragraph(text, styles["MyBullet"], bulletText="-")


def img_flowable(path: Path, max_w: float, max_h: float) -> Image:
    """Return an ``Image`` flowable for *path* scaled to fit ``max_w`` x ``max_h``.

    The aspect ratio is preserved. The same factor is applied whether the
    source is larger or smaller than the box, so small images are enlarged.
    """
    # Only the pixel size is needed here; the context manager closes the file
    # handle immediately instead of leaving it open for every example image.
    with PILImage.open(path) as im:
        w, h = im.size
    scale = min(max_w / w, max_h / h)
    return Image(str(path), width=w * scale, height=h * scale)


def page_footer(canvas, doc) -> None:
    """Draw the footer rule, the document title and the page number.

    Passed to ``doc.build`` as both ``onFirstPage`` and ``onLaterPages``.
    """
    canvas.saveState()
    canvas.setStrokeColor(LINE)
    canvas.line(MARGIN, 1.05 * cm, PAGE_W - MARGIN, 1.05 * cm)
    canvas.setFont("Helvetica", 8)
    canvas.setFillColor(MUTED)
    canvas.drawString(MARGIN, 0.68 * cm, DOC_TITLE)
    canvas.drawRightString(PAGE_W - MARGIN, 0.68 * cm, f"{doc.page}")
    canvas.restoreState()


def callout(title: str, body: str) -> Table:
    """Return a one-cell boxed table showing a bold *title* above *body*."""
    # Paragraph collapses plain whitespace/newlines, so the title/body break
    # must be explicit markup (<br/>) for the two parts to sit on separate lines.
    tbl = Table(
        [[p(f"<b>{title}</b><br/>{body}", "Body")]],
        colWidths=[PAGE_W - 2 * MARGIN],
    )
    tbl.setStyle(TableStyle([
        ("BACKGROUND", (0, 0), (-1, -1), PAPER),
        ("BOX", (0, 0), (-1, -1), 0.8, LINE),
        ("LEFTPADDING", (0, 0), (-1, -1), 12),
        ("RIGHTPADDING", (0, 0), (-1, -1), 12),
        ("TOPPADDING", (0, 0), (-1, -1), 9),
        ("BOTTOMPADDING", (0, 0), (-1, -1), 8),
    ]))
    return tbl


story = []

story.append(p(DOC_TITLE, "TitleBig"))
story.append(p("Track 2: Portrait Composition Generation", "Sub"))
story.append(callout(
    "Core idea",
    "We combine a portrait-composition fine-tuned Qwen-Image model with a "
    "prompt-conditioned adaptive canvas policy. The model improves visual "
    "quality and prompt-to-layout alignment, while the canvas policy selects "
    "a suitable aspect ratio before generation.",
))
story.append(Spacer(1, 0.25 * cm))

story.append(p("1. Method Overview", "H"))
story.append(p(
    "Our solution is built on Qwen-Image and focuses on improving portrait "
    "composition generation through two complementary components: a "
    "portrait-composition fine-tuned generation model and a "
    "prompt-conditioned adaptive canvas policy. The generation model "
    "synthesizes visually coherent portrait images, while the canvas policy "
    "selects the generation aspect ratio before sampling."
))
story.append(p(
    "A fixed square canvas is not always appropriate for portrait "
    "composition. Close-up and centered portraits often work well on square "
    "canvases, full-body portraits benefit from vertical layouts, and "
    "environmental portraits or scenes with roads, coastlines, leading lines, "
    "or large negative space often need horizontal or wide canvases."
))

story.append(p("2. Training Data And Fine-Tuning", "H"))
story.append(p(
    "We fine-tuned Qwen-Image using the official 4,500 PortraitCraft training "
    "samples together with an additional private portrait "
    "aesthetic-composition dataset curated by our team. "
    "The private data focuses on portrait layout, aesthetic framing, "
    "human-subject placement, environmental context, lighting balance, and "
    "composition consistency."
))
story.append(p(
    "We compared LoRA fine-tuning and full-parameter fine-tuning under the "
    "same inference settings. Full-parameter fine-tuning was selected for the "
    "final submission because it performed better for this task, especially "
    "in aesthetic quality, composition stability, and prompt-to-layout "
    "alignment."
))

story.append(p("3. Adaptive Canvas Policy", "H"))
story.append(p(
    "We do not use a fixed 1:1 canvas for all images. Instead, we use a "
    "prompt-conditioned adaptive canvas policy. The policy reads the input "
    "prompt and a learned policy state, then outputs a canvas size before "
    "image generation. The longer side is normalized to 1584 pixels, while "
    "the shorter side is selected from a compact set of portrait-friendly "
    "aspect ratios."
))
story.append(p(
    "The policy was optimized on the training set through an iterative "
    "evolutionary-search procedure. The search adjusted keyword weights, "
    "decision thresholds, and candidate aspect-ratio choices. This lets the "
    "inference system preserve the intended spatial structure for different "
    "prompt types, including square portraits, full-body vertical portraits, "
    "and horizontal environmental portraits."
))
story.append(callout(
    "Reproducibility",
    "We release the final learned policy state together with the inference "
    "code. Reviewers can recover the same canvas selection used by our "
    "submission. For unseen prompts, the implementation falls back to a "
    "deterministic prompt-only rule policy.",
))
story.append(PageBreak())

story.append(p("4. Inference Pipeline", "H"))
story.append(p(
    "The final inference pipeline first applies the adaptive canvas selector "
    "to each prompt, then generates the image using the released "
    "PortraitCraft Track 2 checkpoint. "
    "The output images are saved with their original task filenames and "
    "packaged as a flat zip file."
))

pipeline = Table(
    [
        [
            p("Input prompt", "Small"),
            p("Adaptive canvas policy", "Small"),
            p("Qwen-Image + PortraitCraft checkpoint", "Small"),
            p("Generated image", "Small"),
        ],
        [
            p("Text description and composition analysis", "Small"),
            p("Select width and height, longest side 1584", "Small"),
            p("50 sampling steps, CFG 4.0, fixed seed", "Small"),
            p("Flat image output for submission", "Small"),
        ],
    ],
    colWidths=[4.0 * cm, 4.3 * cm, 5.1 * cm, 4.0 * cm],
)
pipeline.setStyle(TableStyle([
    ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#EFE2D0")),
    ("BACKGROUND", (0, 1), (-1, 1), PAPER),
    ("BOX", (0, 0), (-1, -1), 0.8, LINE),
    ("INNERGRID", (0, 0), (-1, -1), 0.5, LINE),
    ("VALIGN", (0, 0), (-1, -1), "TOP"),
    ("LEFTPADDING", (0, 0), (-1, -1), 8),
    ("RIGHTPADDING", (0, 0), (-1, -1), 8),
    ("TOPPADDING", (0, 0), (-1, -1), 8),
    ("BOTTOMPADDING", (0, 0), (-1, -1), 8),
]))
story.append(pipeline)
story.append(Spacer(1, 0.35 * cm))

story.append(p("5. Inference Configuration", "H"))
for item in [
    "Base model: Qwen-Image",
    "Checkpoint: portraitcraft-track2.safetensors",
    "Sampling steps: 50",
    "CFG scale: 4.0",
    "Seed: 346346",
    "Adaptive canvas longest side: 1584 pixels",
]:
    story.append(bullet(item))
story.append(p(
    "The released GitHub repository contains inference scripts, the adaptive "
    "canvas policy implementation, the learned policy state, and submission "
    "packaging utilities. The model checkpoint is hosted on Hugging Face."
))

story.append(p("6. Reproducibility Links", "H"))
story.append(p("Code repository: https://github.com/w-Jessamine/portraitcraft-track2-solution"))
story.append(p("Model repository: https://huggingface.co/Jessamine/portraitcraft-track2"))
story.append(PageBreak())

story.append(p("7. Qualitative Examples", "H"))
story.append(p(
    "The following generated examples illustrate how the adaptive canvas "
    "policy supports different composition needs. "
    "Square layouts are used for centered portraits, vertical layouts "
    "preserve human-body framing and breathing room, and horizontal layouts "
    "support environmental context and directional visual flow."
))

examples = [
    (
        "Square portrait canvas",
        "square_portrait.jpg",
        "Centered or close portrait layouts benefit from a balanced 1:1 frame.",
    ),
    (
        "Vertical portrait canvas",
        "vertical_portrait.jpg",
        "Vertical canvases preserve subject height, headroom, and full-body framing.",
    ),
    (
        "Horizontal environmental portrait",
        "horizontal_environment.jpg",
        "Horizontal canvases help preserve scene context and lateral visual flow.",
    ),
    (
        "Wide environmental composition",
        "wide_environment.jpg",
        "Wider layouts support roads, coastlines, leading lines, and negative space.",
    ),
]

# One entry per example: the list of flowables (image, gap, caption) that
# makes up a single table cell.
rows = [
    [
        img_flowable(ASSET / filename, 7.8 * cm, 5.2 * cm),
        Spacer(1, 0.08 * cm),
        p(f"<b>{title}</b><br/>{caption}", "Small"),
    ]
    for title, filename, caption in examples
]

# two-column table, each cell wraps image+caption
cells = [rows[i:i + 2] for i in range(0, len(rows), 2)]
if cells and len(cells[-1]) < 2:
    # An odd number of examples leaves the last row short; pad it with an
    # empty cell so every table row has two columns.
    cells[-1].append("")

ex_table = Table(cells, colWidths=[8.45 * cm, 8.45 * cm])
ex_table.setStyle(TableStyle([
    ("VALIGN", (0, 0), (-1, -1), "TOP"),
    ("LEFTPADDING", (0, 0), (-1, -1), 4),
    ("RIGHTPADDING", (0, 0), (-1, -1), 4),
    ("TOPPADDING", (0, 0), (-1, -1), 4),
    ("BOTTOMPADDING", (0, 0), (-1, -1), 10),
]))
story.append(ex_table)
story.append(PageBreak())

story.append(p("8. Summary", "H"))
story.append(p(
    "Our final solution uses a full-parameter fine-tuned Qwen-Image "
    "checkpoint and a prompt-conditioned adaptive canvas policy. The training "
    "data combines the official PortraitCraft training set with private "
    "portrait aesthetic-composition data. The adaptive canvas policy was "
    "optimized on the training data and released with the code to make "
    "canvas selection reproducible."
))
story.append(p(
    "This design improves portrait composition generation by matching the "
    "canvas to the prompt's spatial intent before image synthesis, rather "
    "than forcing every image into the same square frame."
))
story.append(Spacer(1, 0.4 * cm))
story.append(callout(
    "Submitted artifacts",
    "The accompanying GitHub repository contains runnable inference code and "
    "policy files. The Hugging Face repository contains the final "
    "PortraitCraft Track 2 checkpoint and matching documentation.",
))

doc = SimpleDocTemplate(
    str(OUT),
    pagesize=A4,
    leftMargin=MARGIN,
    rightMargin=MARGIN,
    topMargin=1.45 * cm,
    bottomMargin=1.35 * cm,
    title=DOC_TITLE,
)
doc.build(story, onFirstPage=page_footer, onLaterPages=page_footer)
print(OUT)