Spaces:

Fred808
/

PIL2

Paused

App Files Files Community

Fred808 committed on Jul 16, 2025

Commit

b5dfc9f

verified ·

1 Parent(s): 3d0c398

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -63

app.py CHANGED Viewed

@@ -1,78 +1,84 @@
-import io
 import os
 import torch
 from PIL import Image
-from fastapi import FastAPI, UploadFile, File
-from fastapi.responses import JSONResponse
 from transformers import AutoProcessor, AutoModelForCausalLM
-# Setup
-device = "cuda" if torch.cuda.is_available() else "cpu"
-app = FastAPI(title="Florence-2 Base Image Captioning API")
-# Load Florence-2 base model
-try:
-    vision_model = AutoModelForCausalLM.from_pretrained(
-        'microsoft/Florence-2-base',
-        trust_remote_code=True,
-        attn_implementation="eager"
-    ).to(device).eval()
-    vision_processor = AutoProcessor.from_pretrained(
-        'microsoft/Florence-2-base',
-        trust_remote_code=True
-    )
-except Exception as e:
-    vision_model = None
-    vision_processor = None
-    print(f"Model loading error: {e}")
-@app.post("/describe-image")
-async def describe_image(file: UploadFile = File(...)):
-    if vision_model is None or vision_processor is None:
-        return JSONResponse(status_code=500, content={"error": "Model not loaded"})
-    try:
-        contents = await file.read()
-        image = Image.open(io.BytesIO(contents)).convert("RGB")
-        # Preprocess
-        inputs = vision_processor(
-            text="<MORE_DETAILED_CAPTION>",
-            images=image,
-            return_tensors="pt"
-        ).to(device)
-        with torch.no_grad():
-            generated_ids = vision_model.generate(
-                input_ids=inputs["input_ids"],
-                pixel_values=inputs["pixel_values"],
-                max_new_tokens=1024,
-                num_beams=3,
-            )
-        generated_text = vision_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-        processed = vision_processor.post_process_generation(
-            generated_text,
-            task="<MORE_DETAILED_CAPTION>",
-            image_size=image.size
         )
-        caption = processed["<MORE_DETAILED_CAPTION>"]
-        return JSONResponse(content={
-            "filename": file.filename,
-            "description": caption
-        })
-    except Exception as e:
-        return JSONResponse(status_code=500, content={"error": str(e)})
-@app.get("/")
-def root():
-    return {"message": "Florence-2 Base Image Captioning API is running"}
-# Run the app when executed directly
 if __name__ == "__main__":
-    import uvicorn
-    port = int(os.getenv("PORT", 7860))  # Spaces set PORT env var
-    uvicorn.run("app:app", host="0.0.0.0", port=port)

 import os
+import cv2
 import torch
+from pathlib import Path
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForCausalLM
+# ===== CONFIG =====
+# Input video handed to extract_frames() by the __main__ block.
+VIDEO_PATH = "How.mp4"  # Set to your local video file
+# Directory that receives the extracted PNG frames (created if missing).
+FRAMES_DIR = "extracted"
+# Target number of frames to keep per second of video.
+FPS = 3
+# Device the model and its inputs are moved to via .to(DEVICE).
+DEVICE = "cpu"  # Force CPU to avoid NCCL GPU issue
+# ===== Ensure Output Directory =====
+def ensure_dir(path):
+    """Create directory *path* and any missing parents; do nothing if it already exists."""
+    target = Path(path)
+    target.mkdir(parents=True, exist_ok=True)
+# ===== Frame Extraction Function =====
+def extract_frames(video_path, output_dir, fps=3):
+    """Sample frames from a video at roughly ``fps`` frames per second and save them as PNGs.
+
+    Args:
+        video_path: Path of the video file to read.
+        output_dir: Directory that receives the frames; created if missing.
+        fps: Desired number of saved frames per second of video. Must be > 0.
+
+    Returns:
+        List of saved frame paths (as strings) in capture order, named
+        ``0001.png``, ``0002.png``, ...  Empty if ``fps`` is not positive or
+        the video cannot be opened.
+    """
+    # Reject a zero/negative rate up front; it would otherwise divide by zero below.
+    if not fps > 0:
+        print(f"[ERROR] fps must be positive, got: {fps}")
+        return []
+    ensure_dir(output_dir)
+    cap = cv2.VideoCapture(str(video_path))
+    frame_paths = []
+    # try/finally so the capture handle is released even if reading or writing raises.
+    try:
+        if not cap.isOpened():
+            print(f"[ERROR] Failed to open video file: {video_path}")
+            return []
+        video_fps = cap.get(cv2.CAP_PROP_FPS)
+        # `not x > 0` also catches NaN, which a plain `x <= 0` test lets through.
+        if not video_fps or not video_fps > 0:
+            print("[WARN] Using fallback FPS: 30")
+            video_fps = 30
+        # Clamp to 1: when fps exceeds the source rate the ratio rounds to 0,
+        # which would make the modulo below raise ZeroDivisionError.
+        frame_interval = max(1, int(round(video_fps / fps)))
+        frame_idx = 0
+        saved_idx = 1
+        while cap.isOpened():
+            ret, frame = cap.read()
+            if not ret:
+                break
+            if frame_idx % frame_interval == 0:
+                frame_name = f"{saved_idx:04d}.png"
+                output_path = Path(output_dir) / frame_name
+                # imwrite signals failure by returning False; only record frames actually written.
+                if cv2.imwrite(str(output_path), frame):
+                    frame_paths.append(str(output_path))
+                    print(f"[INFO] Saved frame {frame_idx} -> {frame_name}")
+                    saved_idx += 1
+                else:
+                    print(f"[WARN] Failed to write frame {frame_idx} -> {frame_name}")
+            frame_idx += 1
+    finally:
+        cap.release()
+    return frame_paths
+# ===== Load Florence-2 Base Model =====
+# Loaded once at import time; analyze_frame() reuses these module-level objects.
+print("[INFO] Loading Florence-2-base model on CPU")
+processor = AutoProcessor.from_pretrained(
+    "microsoft/Florence-2-base",
+    trust_remote_code=True,
+)
+model = AutoModelForCausalLM.from_pretrained(
+    "microsoft/Florence-2-base",
+    trust_remote_code=True,
+    attn_implementation="eager",
+).to(DEVICE).eval()
+# ===== Analyze a Frame =====
+def analyze_frame(image_path):
+    """Return the Florence-2 ``<MORE_DETAILED_CAPTION>`` result for the image at *image_path*."""
+    task_prompt = "<MORE_DETAILED_CAPTION>"
+    frame = Image.open(image_path).convert("RGB")
+    batch = processor(
+        text=task_prompt,
+        images=frame,
+        return_tensors="pt",
+    ).to(DEVICE)
+    # Inference only, so gradient tracking is switched off.
+    with torch.no_grad():
+        token_ids = model.generate(
+            input_ids=batch["input_ids"],
+            pixel_values=batch["pixel_values"],
+            max_new_tokens=1024,
+            num_beams=3,
+            do_sample=False,
         )
+    decoded = processor.batch_decode(token_ids, skip_special_tokens=False)
+    parsed = processor.post_process_generation(
+        decoded[0],
+        task=task_prompt,
+        image_size=frame.size,
+    )
+    return parsed[task_prompt]
+# ===== Main Execution =====
 if __name__ == "__main__":
+    # Step 1: write the sampled frames to disk.
+    frame_list = extract_frames(VIDEO_PATH, FRAMES_DIR, FPS)
+    print(f"[INFO] {len(frame_list)} frames extracted.")
+    # Step 2: caption every saved frame, in order, numbering from 1.
+    for number, saved_frame in enumerate(frame_list, start=1):
+        print(f"\n[FRAME {number}] Analyzing: {saved_frame}")
+        print(f"[RESULT] {analyze_frame(saved_frame)}")