Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -13,76 +13,59 @@ from unet import UNetModel
|
|
| 13 |
from feature_extractor import Mixed_Encoder
|
| 14 |
|
| 15 |
# ==========================================
|
| 16 |
-
# 1. SETUP
|
| 17 |
# ==========================================
|
| 18 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 19 |
|
| 20 |
-
HINDI_VOCAB = [
|
| 21 |
-
"अ", "आ", "इ", "ई", "उ", "ऊ", "ऋ", "ए", "ऐ", "ओ", "औ",
|
| 22 |
-
"क", "ख", "ग", "घ", "ङ", "च", "छ", "ज", "झ", "ञ",
|
| 23 |
-
"ट", "ठ", "ड", "ढ", "ण", "त", "थ", "द", "ध", "न",
|
| 24 |
-
"प", "फ", "ब", "भ", "म", "य", "र", "ल", "व", "श",
|
| 25 |
-
"ष", "स", "ह"
|
| 26 |
-
]
|
| 27 |
-
|
| 28 |
# ==========================================
|
| 29 |
-
# 2.
|
| 30 |
# ==========================================
|
| 31 |
-
print(f"
|
| 32 |
-
|
| 33 |
-
# A. Style Encoder
|
| 34 |
-
style_encoder = Mixed_Encoder(model_name='mobilenetv2_100', num_classes=300).to(DEVICE)
|
| 35 |
-
style_weights = torch.load("mixed_hindi_mobilenetv2_100.pth", map_location=DEVICE)
|
| 36 |
-
clean_style_dict = OrderedDict([(k.replace("module.", ""), v) for k, v in style_weights.items()])
|
| 37 |
-
style_encoder.load_state_dict(clean_style_dict)
|
| 38 |
-
style_encoder.eval()
|
| 39 |
-
|
| 40 |
-
# B. Text Encoder (Canine)
|
| 41 |
-
tokenizer = CanineTokenizer.from_pretrained("google/canine-c")
|
| 42 |
-
text_encoder = CanineModel.from_pretrained("google/canine-c").to(DEVICE)
|
| 43 |
-
|
| 44 |
-
# C. VAE
|
| 45 |
-
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse").to(DEVICE)
|
| 46 |
-
|
| 47 |
-
# D. UNet (Matched to your NLTM training)
|
| 48 |
-
unet = UNetModel(
|
| 49 |
-
image_size=(64, 256),
|
| 50 |
-
in_channels=4,
|
| 51 |
-
model_channels=320,
|
| 52 |
-
out_channels=4,
|
| 53 |
-
num_res_blocks=1,
|
| 54 |
-
attention_resolutions=[4, 2, 1],
|
| 55 |
-
channel_mult=[1, 1, 1, 1],
|
| 56 |
-
context_dim=320
|
| 57 |
-
).to(DEVICE)
|
| 58 |
-
|
| 59 |
-
# E. Super-Loader for ema_ckpt.pt
|
| 60 |
-
full_checkpoint = torch.load("ema_ckpt.pt", map_location=DEVICE)
|
| 61 |
-
clean_unet_dict = OrderedDict()
|
| 62 |
-
clean_text_dict = OrderedDict()
|
| 63 |
-
|
| 64 |
-
for k, v in full_checkpoint.items():
|
| 65 |
-
clean_key = k.replace("module.", "")
|
| 66 |
-
if "text_encoder." in clean_key:
|
| 67 |
-
clean_text_dict[clean_key.split("text_encoder.")[-1]] = v
|
| 68 |
-
else:
|
| 69 |
-
clean_unet_dict[clean_key] = v
|
| 70 |
-
|
| 71 |
-
unet.load_state_dict(clean_unet_dict, strict=False)
|
| 72 |
-
try:
|
| 73 |
-
text_encoder.load_state_dict(clean_text_dict, strict=False)
|
| 74 |
-
except:
|
| 75 |
-
pass # Fallback to base Canine if keys mismatch
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
# ==========================================
|
| 83 |
# 3. INFERENCE ENGINE
|
| 84 |
# ==========================================
|
| 85 |
-
|
| 86 |
transforms.Resize((224, 224)),
|
| 87 |
transforms.ToTensor(),
|
| 88 |
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
|
@@ -90,63 +73,49 @@ style_transform = transforms.Compose([
|
|
| 90 |
|
| 91 |
def predict(hindi_text, s1, s2):
|
| 92 |
if not hindi_text: return None
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
latents = 1 / 0.18215 * latents
|
| 124 |
-
image = vae.decode(latents).sample
|
| 125 |
-
image = (image / 2 + 0.5).clamp(0, 1)
|
| 126 |
-
image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
|
| 127 |
-
|
| 128 |
-
return Image.fromarray((image * 255).astype(np.uint8))
|
| 129 |
|
| 130 |
# ==========================================
|
| 131 |
-
# 4.
|
| 132 |
# ==========================================
|
| 133 |
-
with gr.Blocks(
|
| 134 |
-
gr.Markdown("# 🖋️ DiffusionPen
|
| 135 |
-
gr.Markdown("### Developed by Kishan Madlani | NIT Surat")
|
| 136 |
-
|
| 137 |
with gr.Row():
|
| 138 |
with gr.Column():
|
| 139 |
-
|
| 140 |
-
gr.
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
with gr.Column():
|
| 147 |
-
result_view = gr.Image(label="Output")
|
| 148 |
-
gr.Markdown("**Note:** Using 10 inference steps for real-time CPU performance.")
|
| 149 |
-
|
| 150 |
-
btn.click(fn=predict, inputs=[text_box, img1, img2], outputs=result_view)
|
| 151 |
|
| 152 |
demo.launch()
|
|
|
|
| 13 |
from feature_extractor import Mixed_Encoder
|
| 14 |
|
| 15 |
# ==========================================
# 1. SETUP
# ==========================================
# Pick the compute device once at import time; prefer the GPU when present.
_has_gpu = torch.cuda.is_available()
DEVICE = "cuda" if _has_gpu else "cpu"
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
# ==========================================
# 2. MODEL LOADING (With Error Catching)
# ==========================================
print(f"📦 Loading Super-Checkpoint on {DEVICE}...")

try:
    # A. VAE — Stable Diffusion fine-tuned autoencoder used only for decoding latents.
    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse").to(DEVICE)

    # B. Style Encoder — strip DataParallel's "module." prefix before loading.
    style_encoder = Mixed_Encoder(model_name='mobilenetv2_100', num_classes=300).to(DEVICE)
    s_weights = torch.load("mixed_hindi_mobilenetv2_100.pth", map_location=DEVICE)
    style_encoder.load_state_dict(
        OrderedDict([(k.replace("module.", ""), v) for k, v in s_weights.items()])
    )
    style_encoder.eval()

    # C. Text Encoder & Tokenizer (character-level Canine — no subword vocab needed for Hindi).
    tokenizer = CanineTokenizer.from_pretrained("google/canine-c")
    text_encoder = CanineModel.from_pretrained("google/canine-c").to(DEVICE)

    # D. UNet (1 ResBlock, 320 Context) — must match the training configuration exactly.
    unet = UNetModel(
        image_size=(64, 256), in_channels=4, model_channels=320, out_channels=4,
        num_res_blocks=1, attention_resolutions=[4, 2, 1], channel_mult=[1, 1, 1, 1], context_dim=320
    ).to(DEVICE)

    # E. Super-Loader for ema_ckpt.pt: the checkpoint mixes UNet and fine-tuned
    # text-encoder weights; split them by key prefix after dropping "module.".
    ckpt = torch.load("ema_ckpt.pt", map_location=DEVICE)
    u_dict, t_dict = OrderedDict(), OrderedDict()
    for k, v in ckpt.items():
        clean = k.replace("module.", "")
        if "text_encoder." in clean:
            t_dict[clean.split("text_encoder.")[-1]] = v
        else:
            u_dict[clean] = v

    unet.load_state_dict(u_dict, strict=False)
    try:
        text_encoder.load_state_dict(t_dict, strict=False)
    except Exception:  # was a bare `except:` — narrow it so KeyboardInterrupt etc. escape
        print("⚠️ Using base Canine weights.")

    unet.eval()
    text_encoder.eval()
    scheduler = DDPMScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
    print("✅ All models loaded perfectly!")

except Exception as e:
    print(f"❌ CRITICAL LOAD ERROR: {e}")
    # Re-raise instead of swallowing: continuing with undefined globals only
    # defers the failure to the first predict() call with no useful traceback.
    raise
|
| 64 |
|
| 65 |
# ==========================================
# 3. INFERENCE ENGINE
# ==========================================
# Preprocessing for the style reference images: resize to the encoder's
# expected input and apply standard ImageNet normalisation.
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]
st_trans = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(_IMAGENET_MEAN, _IMAGENET_STD),
])
|
|
|
|
| 73 |
|
| 74 |
def predict(hindi_text, s1, s2):
    """Generate a handwriting-style image of *hindi_text*.

    Args:
        hindi_text: Text to render; empty/None returns None immediately.
        s1, s2: Up to two style reference images (PIL); at least one required.

    Returns:
        A PIL Image on success, or None on missing input / runtime failure
        (errors are logged, not raised, so the Gradio UI stays responsive).
    """
    if not hindi_text:
        return None
    try:
        with torch.no_grad():
            # 1. Style: average the encoder features over the provided references.
            imgs = [i for i in (s1, s2) if i is not None]
            if not imgs:
                return None
            feats = [style_encoder(st_trans(i).unsqueeze(0).to(DEVICE))[1] for i in imgs]
            style_vec = torch.mean(torch.stack(feats), dim=0)

            # 2. Text: pass the raw tokenizer output so unet.py can run its own
            #    text encoder internally (context = self.text_encoder(**context)).
            t_in = tokenizer(hindi_text, padding="max_length", max_length=128, return_tensors="pt").to(DEVICE)

            # 3. Diffusion (10 steps for CPU speed); latent is 1/8 of the 64x256 canvas.
            latents = torch.randn((1, 4, 8, 32)).to(DEVICE)
            scheduler.set_timesteps(10)

            for t in scheduler.timesteps:
                # BUG FIX: was `style_extractor=style_vector`, a NameError —
                # the local variable is `style_vec`. The broad except below
                # swallowed it on every call, so predict always returned None.
                noise_pred = unet(latents, t.unsqueeze(0).to(DEVICE), context=t_in, style_extractor=style_vec)
                latents = scheduler.step(noise_pred, t, latents).prev_sample

            # 4. Decode latents to pixels (0.18215 is the SD VAE scaling factor).
            latents = 1 / 0.18215 * latents
            img = vae.decode(latents).sample
            img = (img / 2 + 0.5).clamp(0, 1).cpu().permute(0, 2, 3, 1).numpy()[0]
            return Image.fromarray((img * 255).astype(np.uint8))

    except Exception as e:
        print(f"❌ RUNTIME ERROR: {e}")
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
# ==========================================
# 4. UI
# ==========================================
# Minimal Gradio front-end: a text box plus two optional style references.
with gr.Blocks() as demo:
    gr.Markdown("# 🖋️ DiffusionPen (NIT Surat)")
    with gr.Row():
        with gr.Column():
            txt = gr.Textbox(label="Hindi Text")
            im1 = gr.Image(type="pil", label="Style 1")
            im2 = gr.Image(type="pil", label="Style 2")
            btn = gr.Button("Generate")
            out = gr.Image(label="Result")
            btn.click(predict, [txt, im1, im2], out)

demo.launch()
|