from __future__ import annotations

import logging
from pathlib import Path

from PIL import Image as PILImage
from reportlab.lib import colors
from reportlab.lib.enums import TA_LEFT
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.units import cm
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import (
    Image,
    PageBreak,
    Paragraph,
    SimpleDocTemplate,
    Spacer,
    Table,
    TableStyle,
)
|
|
# All paths are anchored at the directory that contains this script, so the
# output location does not depend on the current working directory.
ROOT = Path(__file__).resolve().parents[0]
# Destination of the generated PDF.
OUT = ROOT.joinpath("solution_description.pdf")
# Directory holding the example images embedded in the document.
ASSET = ROOT.joinpath("assets")
|
|
# Optional font registration. The path is a system-font location (presumably
# only present on macOS), so this is expected to fail elsewhere. A failure
# must never abort PDF generation, but it is logged at debug level instead of
# being swallowed silently so that a missing font can still be diagnosed.
try:
    pdfmetrics.registerFont(TTFont("HelveticaNeue", "/System/Library/Fonts/Helvetica.ttc"))
except Exception as exc:  # deliberate best-effort: any failure is non-fatal
    logging.getLogger(__name__).debug(
        "Optional font 'HelveticaNeue' was not registered: %s", exc
    )
|
|
# Page geometry: sheet width/height taken from the A4 page size, plus the
# left/right margin used by the document template, the footer and callouts.
PAGE_W = A4[0]
PAGE_H = A4[1]
MARGIN = 1.55 * cm

# Colour palette shared by the paragraph styles, the tables and the footer.
_hex = colors.HexColor
ACCENT = _hex("#A54F2B")  # section headings
INK = _hex("#211C17")  # title, body and bullet text
MUTED = _hex("#6F665B")  # subtitle, small text and footer text
LINE = _hex("#D8CAB8")  # table borders/grids and the footer rule
PAPER = _hex("#FFF9EF")  # callout and table-row background
|
|
# Start from ReportLab's sample stylesheet and register the custom paragraph
# styles used by this document. Each entry is
# (new style name, parent style name, attribute overrides).
styles = getSampleStyleSheet()

_CUSTOM_STYLES = (
    ("TitleBig", "Title", dict(
        fontName="Helvetica-Bold", fontSize=24, leading=28,
        textColor=INK, spaceAfter=8, alignment=TA_LEFT,
    )),
    ("Sub", "BodyText", dict(
        fontName="Helvetica", fontSize=10.5, leading=14,
        textColor=MUTED, spaceAfter=14,
    )),
    ("H", "Heading2", dict(
        fontName="Helvetica-Bold", fontSize=14.5, leading=18,
        textColor=ACCENT, spaceBefore=10, spaceAfter=7,
    )),
    ("Body", "BodyText", dict(
        fontName="Helvetica", fontSize=9.7, leading=13.2,
        textColor=INK, spaceAfter=7,
    )),
    ("Small", "BodyText", dict(
        fontName="Helvetica", fontSize=8.2, leading=10.5,
        textColor=MUTED, spaceAfter=4,
    )),
    ("MyBullet", "BodyText", dict(
        fontName="Helvetica", fontSize=9.4, leading=12.5,
        textColor=INK, leftIndent=12, bulletIndent=3, spaceAfter=4,
    )),
)

for _style_name, _parent_name, _overrides in _CUSTOM_STYLES:
    styles.add(ParagraphStyle(name=_style_name, parent=styles[_parent_name], **_overrides))
|
|
|
|
def p(text: str, style="Body"):
    """Return a Paragraph for *text* using the stylesheet entry named *style*.

    *text* is handed to ``Paragraph`` unchanged, so it may contain the inline
    markup (``<b>``, ``<br/>``) used elsewhere in this file.
    """
    chosen_style = styles[style]
    return Paragraph(text, chosen_style)
|
|
|
|
def bullet(text: str):
    """Return *text* as a "MyBullet"-styled Paragraph with "-" as its bullet."""
    bullet_style = styles["MyBullet"]
    return Paragraph(text, style=bullet_style, bulletText="-")
|
|
|
|
def img_flowable(path: Path, max_w: float, max_h: float) -> Image:
    """Return an Image flowable for *path* scaled to fit a bounding box.

    The picture keeps its aspect ratio: one uniform scale factor is chosen so
    that the result is no wider than *max_w* and no taller than *max_h*.
    Images smaller than the box are scaled up by the same rule.

    Args:
        path: Location of the image file.
        max_w: Maximum width of the flowable.
        max_h: Maximum height of the flowable.

    Returns:
        A ReportLab ``Image`` with explicit ``width`` and ``height``.
    """
    # Pillow is only used to read the pixel dimensions. The context manager
    # closes the file handle straight away instead of leaving it open until
    # garbage collection.
    with PILImage.open(path) as im:
        w, h = im.size
    scale = min(max_w / w, max_h / h)
    return Image(str(path), width=w * scale, height=h * scale)
|
|
|
|
def page_footer(canvas, doc):
    """Draw the page footer: a rule, the document title and the page number.

    Args:
        canvas: Canvas of the page being drawn.
        doc: Document being built; ``doc.page`` supplies the page number.
    """
    rule_y = 1.05 * cm
    text_y = 0.68 * cm
    right_edge = PAGE_W - MARGIN

    # Save/restore keeps the footer's colour and font settings from leaking
    # into whatever is drawn on the canvas afterwards.
    canvas.saveState()

    canvas.setStrokeColor(LINE)
    canvas.line(MARGIN, rule_y, right_edge, rule_y)

    canvas.setFont("Helvetica", 8)
    canvas.setFillColor(MUTED)
    canvas.drawString(MARGIN, text_y, "PortraitCraft Track 2 Solution Description")
    canvas.drawRightString(right_edge, text_y, str(doc.page))

    canvas.restoreState()
|
|
|
|
def callout(title: str, body: str):
    """Return a boxed, tinted one-cell table with a bold *title* above *body*.

    Both strings are interpolated into paragraph markup unchanged. The table
    is ``PAGE_W - 2 * MARGIN`` wide.
    """
    everywhere = ((0, 0), (-1, -1))  # (start cell, end cell) covering the table

    commands = [
        ("BACKGROUND", *everywhere, PAPER),
        ("BOX", *everywhere, 0.8, LINE),
    ]
    commands.extend(
        (side, *everywhere, amount)
        for side, amount in (
            ("LEFTPADDING", 12),
            ("RIGHTPADDING", 12),
            ("TOPPADDING", 9),
            ("BOTTOMPADDING", 8),
        )
    )

    content = p(f"<b>{title}</b><br/>{body}", "Body")
    box = Table([[content]], colWidths=[PAGE_W - 2 * MARGIN])
    box.setStyle(TableStyle(commands))
    return box
|
|
|
|
# The document body: an ordered list of flowables. It opens with the title,
# subtitle, the "Core idea" callout and section 1; later sections append to it.
story = [
    p("PortraitCraft Track 2 Solution Description", "TitleBig"),
    p("Track 2: Portrait Composition Generation", "Sub"),
    callout(
        "Core idea",
        "We combine a portrait-composition fine-tuned Qwen-Image model with a prompt-conditioned adaptive canvas policy. The model improves visual quality and prompt-to-layout alignment, while the canvas policy selects a suitable aspect ratio before generation.",
    ),
    Spacer(1, 0.25 * cm),
    p("1. Method Overview", "H"),
    p("Our solution is built on Qwen-Image and focuses on improving portrait composition generation through two complementary components: a portrait-composition fine-tuned generation model and a prompt-conditioned adaptive canvas policy. The generation model synthesizes visually coherent portrait images, while the canvas policy selects the generation aspect ratio before sampling."),
    p("A fixed square canvas is not always appropriate for portrait composition. Close-up and centered portraits often work well on square canvases, full-body portraits benefit from vertical layouts, and environmental portraits or scenes with roads, coastlines, leading lines, or large negative space often need horizontal or wide canvases."),
]
|
|
# Section 2: training data and fine-tuning strategy.
story += [
    p("2. Training Data And Fine-Tuning", "H"),
    p("We fine-tuned Qwen-Image using the official 4,500 PortraitCraft training samples together with an additional private portrait aesthetic-composition dataset curated by our team. The private data focuses on portrait layout, aesthetic framing, human-subject placement, environmental context, lighting balance, and composition consistency."),
    p("We compared LoRA fine-tuning and full-parameter fine-tuning under the same inference settings. Full-parameter fine-tuning was selected for the final submission because it performed better for this task, especially in aesthetic quality, composition stability, and prompt-to-layout alignment."),
]
|
|
# Section 3: the adaptive canvas policy, closed by a reproducibility callout.
story += [
    p("3. Adaptive Canvas Policy", "H"),
    p("We do not use a fixed 1:1 canvas for all images. Instead, we use a prompt-conditioned adaptive canvas policy. The policy reads the input prompt and a learned policy state, then outputs a canvas size before image generation. The longer side is normalized to 1584 pixels, while the shorter side is selected from a compact set of portrait-friendly aspect ratios."),
    p("The policy was optimized on the training set through an iterative evolutionary-search procedure. The search adjusted keyword weights, decision thresholds, and candidate aspect-ratio choices. This lets the inference system preserve the intended spatial structure for different prompt types, including square portraits, full-body vertical portraits, and horizontal environmental portraits."),
    callout(
        "Reproducibility",
        "We release the final learned policy state together with the inference code. Reviewers can recover the same canvas selection used by our submission. For unseen prompts, the implementation falls back to a deterministic prompt-only rule policy.",
    ),
]
|
|
# Section 4 starts on a new page: heading, description, then a two-row table
# that lists the pipeline stages (row 0) and a note for each stage (row 1).
story += [
    PageBreak(),
    p("4. Inference Pipeline", "H"),
    p("The final inference pipeline first applies the adaptive canvas selector to each prompt, then generates the image using the released PortraitCraft Track 2 checkpoint. The output images are saved with their original task filenames and packaged as a flat zip file."),
]

_stage_titles = (
    "Input prompt",
    "Adaptive canvas policy",
    "Qwen-Image + PortraitCraft checkpoint",
    "Generated image",
)
_stage_notes = (
    "Text description and composition analysis",
    "Select width and height, longest side 1584",
    "50 sampling steps, CFG 4.0, fixed seed",
    "Flat image output for submission",
)

pipeline = Table(
    [
        [p(label, "Small") for label in _stage_titles],
        [p(label, "Small") for label in _stage_notes],
    ],
    colWidths=[4.0 * cm, 4.3 * cm, 5.1 * cm, 4.0 * cm],
)

_whole_table = ((0, 0), (-1, -1))
_pipeline_commands = [
    ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#EFE2D0")),  # stage-name row
    ("BACKGROUND", (0, 1), (-1, 1), PAPER),  # stage-note row
    ("BOX", *_whole_table, 0.8, LINE),
    ("INNERGRID", *_whole_table, 0.5, LINE),
    ("VALIGN", *_whole_table, "TOP"),
]
# Uniform padding of 8 on every side of every cell.
_pipeline_commands += [
    (side, *_whole_table, 8)
    for side in ("LEFTPADDING", "RIGHTPADDING", "TOPPADDING", "BOTTOMPADDING")
]
pipeline.setStyle(TableStyle(_pipeline_commands))
story.append(pipeline)
# Section 5: the inference settings as a bullet list, followed by a note on
# where the code and the checkpoint are published.
story.append(Spacer(1, 0.35 * cm))
story.append(p("5. Inference Configuration", "H"))
story.extend(
    bullet(setting)
    for setting in (
        "Base model: Qwen-Image",
        "Checkpoint: portraitcraft-track2.safetensors",
        "Sampling steps: 50",
        "CFG scale: 4.0",
        "Seed: 346346",
        "Adaptive canvas longest side: 1584 pixels",
    )
)
story.append(p("The released GitHub repository contains inference scripts, the adaptive canvas policy implementation, the learned policy state, and submission packaging utilities. The model checkpoint is hosted on Hugging Face."))
|
|
# Section 6: links to the published code and model repositories.
story += [
    p("6. Reproducibility Links", "H"),
    p("Code repository: https://github.com/w-Jessamine/portraitcraft-track2-solution"),
    p("Model repository: https://huggingface.co/Jessamine/portraitcraft-track2"),
]
|
|
# Section 7 starts on a new page: heading, introduction, then a two-column
# grid of example images with captions.
story.append(PageBreak())
story.append(p("7. Qualitative Examples", "H"))
story.append(p("The following generated examples illustrate how the adaptive canvas policy supports different composition needs. Square layouts are used for centered portraits, vertical layouts preserve human-body framing and breathing room, and horizontal layouts support environmental context and directional visual flow."))

# (title, image file name inside ASSET, caption)
examples = [
    ("Square portrait canvas", "square_portrait.jpg", "Centered or close portrait layouts benefit from a balanced 1:1 frame."),
    ("Vertical portrait canvas", "vertical_portrait.jpg", "Vertical canvases preserve subject height, headroom, and full-body framing."),
    ("Horizontal environmental portrait", "horizontal_environment.jpg", "Horizontal canvases help preserve scene context and lateral visual flow."),
    ("Wide environmental composition", "wide_environment.jpg", "Wider layouts support roads, coastlines, leading lines, and negative space."),
]

# One grid cell per example: the scaled image, a small gap, then the bold
# title with the caption underneath.
example_cells = [
    [
        img_flowable(ASSET / filename, 7.8 * cm, 5.2 * cm),
        Spacer(1, 0.08 * cm),
        p(f"<b>{title}</b><br/>{caption}", "Small"),
    ]
    for title, filename, caption in examples
]

# Lay the cells out two per row. An odd number of examples used to raise
# IndexError when pairing; pad the last row with an empty cell instead so
# every table row has exactly two columns.
if len(example_cells) % 2:
    example_cells.append("")
cells = [example_cells[i:i + 2] for i in range(0, len(example_cells), 2)]

# Only build the table when there is something to show, so an empty example
# list does not hand the table an empty grid.
if cells:
    ex_table = Table(cells, colWidths=[8.45 * cm, 8.45 * cm])
    ex_table.setStyle(TableStyle([
        ("VALIGN", (0, 0), (-1, -1), "TOP"),
        ("LEFTPADDING", (0, 0), (-1, -1), 4),
        ("RIGHTPADDING", (0, 0), (-1, -1), 4),
        ("TOPPADDING", (0, 0), (-1, -1), 4),
        ("BOTTOMPADDING", (0, 0), (-1, -1), 10),
    ]))
    story.append(ex_table)
|
|
# Section 8 starts on a new page: the summary paragraphs and a closing callout
# listing the submitted artifacts.
story += [
    PageBreak(),
    p("8. Summary", "H"),
    p("Our final solution uses a full-parameter fine-tuned Qwen-Image checkpoint and a prompt-conditioned adaptive canvas policy. The training data combines the official PortraitCraft training set with private portrait aesthetic-composition data. The adaptive canvas policy was optimized on the training data and released with the code to make canvas selection reproducible."),
    p("This design improves portrait composition generation by matching the canvas to the prompt's spatial intent before image synthesis, rather than forcing every image into the same square frame."),
    Spacer(1, 0.4 * cm),
    callout(
        "Submitted artifacts",
        "The accompanying GitHub repository contains runnable inference code and policy files. The Hugging Face repository contains the final PortraitCraft Track 2 checkpoint and matching documentation.",
    ),
]
|
|
|
# Render the story to OUT as an A4 PDF. The same footer callback is used for
# the first page and for every later page.
_doc_options = dict(
    pagesize=A4,
    leftMargin=MARGIN,
    rightMargin=MARGIN,
    topMargin=1.45 * cm,
    bottomMargin=1.35 * cm,
    title="PortraitCraft Track 2 Solution Description",
)
doc = SimpleDocTemplate(str(OUT), **_doc_options)

_footer_hooks = dict(onFirstPage=page_footer, onLaterPages=page_footer)
doc.build(story, **_footer_hooks)

# Report where the PDF was written.
print(OUT)
|
|