Upload 12 files
Browse files- annotations/README.txt +2 -0
- app.py +23 -0
- cursors/README.txt +2 -0
- frames/README.txt +3 -0
- requirements.txt +11 -0
- scripts/blip2_infer.py +18 -0
- scripts/cursor_tracker.py +38 -0
- scripts/extract_frames.py +27 -0
- scripts/pipeline.py +23 -0
- scripts/test_vision_api.py +27 -0
- scripts/vision_api.py +15 -0
- videos/README.txt +2 -0
annotations/README.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Annotation outputs (JSON) will be saved here
|
| 2 |
+
# Example: cursor_positions.json
|
app.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from scripts.extract_frames import extract_frames
|
| 4 |
+
from scripts.cursor_tracker import track_cursor
|
| 5 |
+
|
| 6 |
+
def run_pipeline():
    """Run the full annotation pipeline: frame extraction, then cursor tracking.

    Creates every working directory up front — including the input video
    directory, so a fresh checkout ends with an empty result instead of
    crashing in os.listdir with FileNotFoundError.
    """
    video_dir = "videos"
    frames_dir = "frames"
    cursor_dir = "cursors"
    annotations_dir = "annotations"
    os.makedirs(video_dir, exist_ok=True)
    os.makedirs(frames_dir, exist_ok=True)
    os.makedirs(cursor_dir, exist_ok=True)
    os.makedirs(annotations_dir, exist_ok=True)
    # Step 1: Extract frames from every supported video container.
    for video_file in os.listdir(video_dir):
        if video_file.lower().endswith(('.mp4', '.avi', '.mov')):
            extract_frames(os.path.join(video_dir, video_file), frames_dir)
    # Step 2: Track the cursor across all extracted frames.
    track_cursor(frames_dir, cursor_dir, os.path.join(annotations_dir, "cursor_positions.json"))
    print("Pipeline complete.")
|
| 21 |
+
|
| 22 |
+
# Allow running the pipeline directly: `python app.py`.
if __name__ == "__main__":
    run_pipeline()
|
cursors/README.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Place your cursor template PNGs here
|
| 2 |
+
# Example: cursor_template.png
|
frames/README.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Example extracted frame
|
| 2 |
+
# This folder will be filled by extract_frames.py
|
| 3 |
+
# Example filename: sample_0001.png
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core ML and API dependencies
|
| 2 |
+
opencv-python-headless>=4.8.0
|
| 3 |
+
numpy>=1.24.0
|
| 4 |
+
torch>=2.1.0
|
| 5 |
+
transformers>=4.40.0
|
| 6 |
+
pillow>=10.0.0
|
| 7 |
+
|
| 8 |
+
# Optional: for API or annotation
|
| 9 |
+
fastapi>=0.110.0
|
| 10 |
+
uvicorn[standard]>=0.29.0
|
| 11 |
+
requests>=2.31.0
|
scripts/blip2_infer.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

# NOTE: these MUST be set before `transformers` is imported — the library
# resolves its cache directories at import time, so setting them afterwards
# (as the original code did) has no effect on where weights are cached.
os.environ["HF_HOME"] = "/tmp/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Model and processor are loaded once at import time so that repeated
# describe_image() calls are cheap. Uses GPU when available.
model_id = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(model_id)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BlipForConditionalGeneration.from_pretrained(model_id).to(device)
|
| 13 |
+
|
| 14 |
+
def describe_image(image_path, prompt="Describe this image."):
    """Return a BLIP-generated caption for the image at *image_path*.

    The *prompt* conditions the caption; generation is capped at 100 new tokens.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    batch = processor(rgb_image, prompt, return_tensors="pt").to(device)
    generated_ids = model.generate(**batch, max_new_tokens=100)
    return processor.decode(generated_ids[0], skip_special_tokens=True)
|
scripts/cursor_tracker.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
def load_templates(cursor_dir):
    """Load every readable cursor template PNG from *cursor_dir*.

    Unreadable/corrupt files are skipped: cv2.imread returns None on failure,
    and the original code appended that None, crashing cv2.matchTemplate later.
    """
    templates = []
    for fname in os.listdir(cursor_dir):
        if fname.endswith('.png'):
            img = cv2.imread(os.path.join(cursor_dir, fname), cv2.IMREAD_UNCHANGED)
            if img is not None:
                templates.append(img)
    return templates
|
| 12 |
+
|
| 13 |
+
def track_cursor(frames_dir, cursor_dir, output_json):
    """Locate the best-matching cursor template in every frame.

    For each PNG in *frames_dir*, runs normalized cross-correlation template
    matching against every template in *cursor_dir* and records the top-left
    corner of the best match plus its score. Results are written to
    *output_json* as a JSON list of {"frame", "cursor_pos", "score"} dicts.
    """
    import json
    templates = load_templates(cursor_dir)
    results = []
    for frame_file in sorted(os.listdir(frames_dir)):
        if not frame_file.endswith('.png'):
            continue
        frame_path = os.path.join(frames_dir, frame_file)
        frame = cv2.imread(frame_path)
        if frame is None:
            # Unreadable frame: skip rather than crash in matchTemplate.
            continue
        best_match = None
        best_val = -np.inf
        for template in templates:
            # Drop the alpha channel if present; a grayscale (2-D) template
            # would be mis-sliced by [..., :3], so pass it through unchanged
            # and skip it if its depth doesn't match the BGR frame.
            tpl = template[..., :3] if template.ndim == 3 else template
            if tpl.ndim != frame.ndim:
                continue
            res = cv2.matchTemplate(frame, tpl, cv2.TM_CCOEFF_NORMED)
            min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res)
            if max_val > best_val:
                best_val = max_val
                best_match = (max_loc, template.shape[:2], max_val)
        if best_match:
            (x, y), (h, w), score = best_match
            results.append({"frame": frame_file, "cursor_pos": [int(x), int(y)], "score": float(score)})
    with open(output_json, "w") as f:
        json.dump(results, f, indent=2)
    print(f"Cursor tracking complete. Results saved to {output_json}")
|
| 36 |
+
|
| 37 |
+
# Standalone usage: track cursors with the repository's default layout.
if __name__ == "__main__":
    track_cursor("frames", "cursors", "annotations/cursor_positions.json")
|
scripts/extract_frames.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import cv2
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
def extract_frames(video_path, output_dir, fps=2):
    """Sample frames from *video_path* at roughly *fps* frames per second.

    Frames are written to *output_dir* as "<stem>_NNNN.png". The sampling
    interval is derived once from the container's reported FPS; some
    containers report 0 FPS, and a requested *fps* above the source rate
    floors the interval to 0 — either case made the original per-iteration
    modulo raise ZeroDivisionError, so the step is clamped to >= 1.
    """
    os.makedirs(output_dir, exist_ok=True)
    vidcap = cv2.VideoCapture(video_path)
    if not vidcap.isOpened():
        print(f"Could not open {video_path}; skipping.")
        return
    video_name = Path(video_path).stem
    src_fps = vidcap.get(cv2.CAP_PROP_FPS)
    # Hoisted out of the loop (it is loop-invariant) and clamped to >= 1.
    step = max(1, int(src_fps // fps)) if src_fps > 0 else 1
    count = 0
    frame_id = 1
    success, image = vidcap.read()
    while success:
        if int(vidcap.get(cv2.CAP_PROP_POS_FRAMES)) % step == 0:
            frame_name = f"{video_name}_{frame_id:04d}.png"
            cv2.imwrite(os.path.join(output_dir, frame_name), image)
            frame_id += 1
        success, image = vidcap.read()
        count += 1
    vidcap.release()
    print(f"Extracted {frame_id-1} frames from {video_path}")
|
| 21 |
+
|
| 22 |
+
if __name__ == "__main__":
    # Batch-extract frames for every supported video in ./videos.
    source_dir = "videos"
    target_dir = "frames"
    for entry in os.listdir(source_dir):
        if entry.lower().endswith(('.mp4', '.avi', '.mov')):
            extract_frames(os.path.join(source_dir, entry), target_dir)
|
scripts/pipeline.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from scripts.extract_frames import extract_frames
|
| 4 |
+
from scripts.cursor_tracker import track_cursor
|
| 5 |
+
|
| 6 |
+
def run_pipeline():
    """Run the full annotation pipeline: frame extraction, then cursor tracking.

    Creates every working directory up front — including the input video
    directory, so a fresh checkout ends with an empty result instead of
    crashing in os.listdir with FileNotFoundError.
    """
    video_dir = "videos"
    frames_dir = "frames"
    cursor_dir = "cursors"
    annotations_dir = "annotations"
    os.makedirs(video_dir, exist_ok=True)
    os.makedirs(frames_dir, exist_ok=True)
    os.makedirs(cursor_dir, exist_ok=True)
    os.makedirs(annotations_dir, exist_ok=True)
    # Step 1: Extract frames from every supported video container.
    for video_file in os.listdir(video_dir):
        if video_file.lower().endswith(('.mp4', '.avi', '.mov')):
            extract_frames(os.path.join(video_dir, video_file), frames_dir)
    # Step 2: Track the cursor across all extracted frames.
    track_cursor(frames_dir, cursor_dir, os.path.join(annotations_dir, "cursor_positions.json"))
    print("Pipeline complete.")
|
| 21 |
+
|
| 22 |
+
# Allow running the pipeline directly: `python scripts/pipeline.py`.
if __name__ == "__main__":
    run_pipeline()
|
scripts/test_vision_api.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
def test_vision_api(image_path, prompt, api_url, output_json_path):
    """POST *image_path* with *prompt* to the vision API and log the reply.

    The parsed JSON reply — or error details when the body is not valid
    JSON — is written to *output_json_path* together with the frame name.
    """
    import json

    with open(image_path, "rb") as img_file:
        files = {"image": img_file}
        data = {"prompt": prompt}
        response = requests.post(api_url, files=files, data=data)
    try:
        result = response.json()
    except ValueError as e:
        # response.json() raises a ValueError subclass on non-JSON bodies;
        # catching only that (not bare Exception) keeps real bugs visible.
        print("Non-JSON response from API:")
        print(response.text)
        result = {"error": str(e), "raw_response": response.text}
    with open(output_json_path, "w") as f:
        json.dump({"frame": os.path.basename(image_path), "result": result}, f, indent=2)
    print(f"Logged response to {output_json_path}")
|
| 19 |
+
|
| 20 |
+
if __name__ == "__main__":
    # Example usage — assumes the FastAPI server (scripts/vision_api.py)
    # is running locally on port 8000.
    test_vision_api(
        image_path="frames/sample_task_0001.png",
        prompt="Describe the user's task, app, actions, and the final goal.",
        api_url="http://localhost:8000/vision",
        output_json_path="annotations/test_vision_api_response.json"
    )
|
scripts/vision_api.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from fastapi import FastAPI, File, UploadFile, Form
|
| 3 |
+
from scripts.blip2_infer import describe_image
|
| 4 |
+
import shutil
|
| 5 |
+
|
| 6 |
+
# Single FastAPI application instance exposing the /vision endpoint.
app = FastAPI()
|
| 7 |
+
|
| 8 |
+
@app.post("/vision")
async def process_vision(image: UploadFile = File(...), prompt: str = Form("Describe this image.")):
    """Caption an uploaded image with the BLIP model.

    The upload is spooled to a unique temp file — the original used a fixed
    "frame.jpg", so concurrent requests clobbered each other's uploads — and
    the file is always removed, even when inference raises.
    """
    import tempfile
    fd, temp_path = tempfile.mkstemp(suffix=".jpg")
    try:
        with os.fdopen(fd, "wb") as f:
            shutil.copyfileobj(image.file, f)
        desc = describe_image(temp_path, prompt)
    finally:
        os.remove(temp_path)
    return {"description": desc}
|
videos/README.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Place your input videos here (e.g., .mp4, .avi, .mov)
|
| 2 |
+
# Example: sample.mp4
|