Spaces:

fast-stager
/

data-collection

Sleeping

App Files Community

Nightfury16 committed 24 days ago

Commit

3c1b983

1 Parent(s): ace880e

update app.py

Browse files

Files changed (1) hide show

app.py +46 -88

app.py CHANGED Viewed

@@ -33,15 +33,8 @@ def sync_pull():
     token = HF_TOKEN if HF_TOKEN and len(HF_TOKEN) > 5 else None
     for filename in ["annotations.csv", "verifications.csv", "skipped.csv"]:
         try:
-            hf_hub_download(
-                repo_id=DATASET_REPO_ID,
-                filename=filename,
-                repo_type="dataset",
-                local_dir=CACHE_DIR,
-                token=token
-            )
-        except:
-            pass
 def sync_push_background(local_path, remote_filename):
     token = HF_TOKEN if HF_TOKEN and len(HF_TOKEN) > 5 else None
@@ -49,26 +42,17 @@ def sync_push_background(local_path, remote_filename):
     def _push():
         try:
             api = HfApi(token=token)
-            api.upload_file(
-                path_or_fileobj=local_path,
-                path_in_repo=remote_filename,
-                repo_id=DATASET_REPO_ID,
-                repo_type="dataset"
-            )
-        except:
-            pass
     threading.Thread(target=_push).start()
 def init_files():
     sync_pull()
     for f in [LABEL_FILE, VERIFY_FILE, SKIP_FILE]:
         if not os.path.exists(f):
-            if f == LABEL_FILE:
-                cols = ["timestamp", "user", "group_id", "url", "score", "label"]
-            elif f == VERIFY_FILE:
-                cols = ["timestamp", "user", "group_id", "url", "is_correct", "corrected_label", "corrected_score"]
-            else:
-                cols = ["timestamp", "user", "group_id"]
             pd.DataFrame(columns=cols).to_csv(f, index=False)
 init_files()
@@ -80,15 +64,8 @@ def load_all_urls():
         with open(URL_FILE, 'r') as f:
             data = json.load(f)
             if "groups" in data:
-                for group in data["groups"]:
-                    urls.extend(group.get("images", []))
-            elif isinstance(data, dict):
-                for rows in data.values():
-                    if isinstance(rows, list):
-                        for row in rows:
-                            if "unstaged_images" in row: urls.append(row["unstaged_images"])
-    except:
-        pass
     return urls
 def get_ordered_groups():
@@ -106,6 +83,8 @@ def get_flagged_groups():
     if not os.path.exists(LABEL_FILE): return []
     try:
         df = pd.read_csv(LABEL_FILE)
         errors = df[(df['score'] == 10) & (df['label'] != 'living_room')]
         return errors['group_id'].unique().tolist()
     except: return []
@@ -118,61 +97,51 @@ def get_saved_values(gid, mode):
     try:
         fname = LABEL_FILE if mode in ["label", "fix"] else VERIFY_FILE
         df = pd.read_csv(fname)
         rows = df[df['group_id'] == gid]
         for _, row in rows.iterrows():
             if mode in ["label", "fix"]:
                 saved_data[row['url']] = {"score": row['score'], "label": row['label']}
             else:
-                saved_data[row['url']] = {
-                    "is_correct": row['is_correct'],
-                    "label": row['corrected_label'],
-                    "score": row['corrected_score']
-                }
     except: pass
     return saved_data
 def get_stats_text():
     all_gids = get_ordered_groups()
     flagged = get_flagged_groups()
-    try: l = len(pd.read_csv(LABEL_FILE)['group_id'].unique())
     except: l = 0
-    try: v = len(pd.read_csv(VERIFY_FILE)['group_id'].unique())
-    except: v = 0
-    err_msg = f" | ⚠️ **Fix:** {len(flagged)}" if flagged else ""
-    return f"**Total:** {len(all_gids)} | **Labeled:** {l} | **Verified:** {v}{err_msg}"
 def render_workspace(mode, history, specific_index=None, move_back=False):
     all_ordered = get_ordered_groups()
     flagged_pool = get_flagged_groups()
-    if mode == "fix" and specific_index is None:
-        target_pool = flagged_pool
-        if not target_pool:
-            return {screen_menu: gr.update(visible=True), screen_work: gr.update(visible=False), log_box: "No errors left."}
-    else:
-        target_pool = all_ordered
     target_gid = None
     if specific_index is not None:
         if 0 <= specific_index < len(all_ordered): target_gid = all_ordered[specific_index]
-        else: return {log_box: "Out of range"}
     elif move_back and len(history) > 1:
         history.pop()
         target_gid = history[-1]
     else:
-        try:
-            l_done = set(pd.read_csv(LABEL_FILE)['group_id'].unique())
-            v_done = set(pd.read_csv(VERIFY_FILE)['group_id'].unique())
         except: l_done, v_done = set(), set()
-        candidates = []
-        for g in target_pool:
-            if mode == "label" and g not in l_done: candidates.append(g)
-            elif mode == "verify" and g in l_done and g not in v_done: candidates.append(g)
-            elif mode == "fix": candidates.append(g)
         if not candidates:
-             return {screen_menu: gr.update(visible=True), screen_work: gr.update(visible=False), log_box: "Finished."}
         target_gid = candidates[0]
     urls = get_group_urls(target_gid)
@@ -191,8 +160,7 @@ def render_workspace(mode, history, specific_index=None, move_back=False):
         processed_images = list(executor.map(fetch, urls))
     updates = {
-        screen_menu: gr.update(visible=False),
-        screen_work: gr.update(visible=True),
         header_md: f"# {mode.upper()} Property #{target_idx + 1} ({target_gid})",
         state_urls: urls, state_hist: history, state_idx: target_idx,
         top_stats: get_stats_text(), log_box: f"Loaded {target_gid}"
@@ -206,22 +174,19 @@ def render_workspace(mode, history, specific_index=None, move_back=False):
             updates[img_objs[i]] = gr.update(value=processed_images[i], visible=True)
             v_sc = saved_vals.get(u, {}).get('score', 5)
             v_lbl = saved_vals.get(u, {}).get('label', "living_room")
-            v_chk = saved_vals.get(u, {}).get('is_correct', True)
             is_err = (v_sc == 10 and v_lbl != "living_room")
-            err_txt = "<span style='color:red'>⚠️ Score 10 is Living Room only</span>" if is_err else ""
             if mode in ["label", "fix"]:
                 updates[c_sld] = gr.update(visible=True, value=v_sc, interactive=True)
                 updates[c_drp] = gr.update(visible=True, value=v_lbl, interactive=True)
                 updates[c_chk] = gr.update(visible=False)
-                updates[c_lbl] = gr.update(visible=True if is_err else False, value=err_txt)
             else:
                 p_lbl, p_sc = r1_vals.get(u, {}).get('label', "?"), r1_vals.get(u, {}).get('score', "?")
                 updates[c_sld] = gr.update(visible=True, value=v_sc if u in saved_vals else p_sc)
                 updates[c_drp] = gr.update(visible=True, value=v_lbl)
-                updates[c_chk] = gr.update(visible=True, value=v_chk)
-                updates[c_lbl] = gr.update(visible=True, value=f"Label: {p_lbl} ({p_sc})")
         else:
             updates[img_objs[i]] = gr.update(visible=False)
             for obj in [c_sld, c_drp, c_chk, c_lbl]: updates[obj] = gr.update(visible=False)
@@ -236,6 +201,7 @@ def save_data(mode, history, urls, *args):
         sc, lbl, chk = args[i*4], args[i*4+1], args[i*4+2]
         if mode in ["label", "fix"]: rows.append([ts, "user", gid, u, sc, lbl])
         else: rows.append([ts, "user", gid, u, chk, lbl, sc])
     fname = LABEL_FILE if mode in ["label", "fix"] else VERIFY_FILE
     with FileLock(LOCK_FILE):
         with open(fname, "a", newline="") as f: csv.writer(f).writerows(rows)
@@ -246,8 +212,10 @@ def refresh_cat():
     all_gids = get_ordered_groups()
     flagged = set(get_flagged_groups())
     try:
-        l_set = set(pd.read_csv(LABEL_FILE)['group_id'].unique())
-        v_set = set(pd.read_csv(VERIFY_FILE)['group_id'].unique())
     except: l_set, v_set = set(), set()
     data = []
     for i, gid in enumerate(all_gids):
@@ -258,20 +226,17 @@ def refresh_cat():
         data.append([i+1, s, gid])
     return pd.DataFrame(data, columns=["#", "Status", "ID"])
-with gr.Blocks(theme=gr.themes.Soft(), title="Labeler") as demo:
     state_mode, state_hist, state_urls, state_idx = gr.State("label"), gr.State([]), gr.State([]), gr.State(0)
     with gr.Row():
         top_stats = gr.Markdown("Loading...")
         btn_home = gr.Button("🏠 Home", size="sm", scale=0)
     with gr.Tabs():
         with gr.Tab("Workspace"):
             with gr.Group() as screen_menu:
                 gr.Markdown("# Welcome")
                 with gr.Row():
-                    b_start_l = gr.Button("Start Labeling", variant="primary")
-                    b_start_v = gr.Button("Start Verification")
-                    b_start_f = gr.Button("🛠 Fix Errors", variant="secondary")
             with gr.Group(visible=False) as screen_work:
                 header_md = gr.Markdown()
                 img_objs, input_objs = [], []
@@ -279,22 +244,15 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Labeler") as demo:
                     for i in range(MAX_IMAGES):
                         with gr.Column(min_width=200):
                             img = gr.Image(interactive=False, height=240)
-                            sld = gr.Slider(1, 10, step=1, label="Score")
-                            drp = gr.Dropdown(ROOM_CLASSES, label="Class")
-                            chk = gr.Checkbox(label="Correct?", value=True)
-                            lbl = gr.Markdown()
                             img_objs.append(img); input_objs.extend([sld, drp, chk, lbl])
                 with gr.Row():
-                    b_back = gr.Button("⬅ Back")
-                    b_save = gr.Button("💾 Save & Next", variant="primary")
                 log_box = gr.Textbox(label="Log", interactive=False)
         with gr.Tab("Catalog"):
             with gr.Row():
                 num_in = gr.Number(value=1, label="Prop #", precision=0)
-                b_go_l = gr.Button("Go (Label)")
-                b_go_v = gr.Button("Go (Verify)")
-                b_go_f = gr.Button("Go (Fix)")
             df_cat = gr.Dataframe(interactive=False)
             b_ref_cat = gr.Button("Refresh")
@@ -305,9 +263,9 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Labeler") as demo:
     b_save.click(save_data, [state_mode, state_hist, state_urls] + input_objs, ALL_IO)
     b_back.click(lambda m, h: render_workspace(m, h, move_back=True), [state_mode, state_hist], ALL_IO)
     btn_home.click(lambda: {screen_menu: gr.update(visible=True), screen_work: gr.update(visible=False), state_hist: []}, None, [screen_menu, screen_work, state_hist])
-    b_go_l.click(lambda: "label", None, state_mode).then(lambda n, m, h: render_workspace(m, h, specific_index=int(n)-1), [num_in, state_mode, state_hist], ALL_IO)
-    b_go_v.click(lambda: "verify", None, state_mode).then(lambda n, m, h: render_workspace(m, h, specific_index=int(n)-1), [num_in, state_mode, state_hist], ALL_IO)
-    b_go_f.click(lambda: "fix", None, state_mode).then(lambda n, m, h: render_workspace(m, h, specific_index=int(n)-1), [num_in, state_mode, state_hist], ALL_IO)
     b_ref_cat.click(refresh_cat, None, df_cat)
     demo.load(refresh_cat, None, df_cat).then(get_stats_text, None, top_stats)

     token = HF_TOKEN if HF_TOKEN and len(HF_TOKEN) > 5 else None
     for filename in ["annotations.csv", "verifications.csv", "skipped.csv"]:
         try:
+            hf_hub_download(repo_id=DATASET_REPO_ID, filename=filename, repo_type="dataset", local_dir=CACHE_DIR, token=token)
+        except: pass
 def sync_push_background(local_path, remote_filename):
     token = HF_TOKEN if HF_TOKEN and len(HF_TOKEN) > 5 else None
     def _push():
         try:
             api = HfApi(token=token)
+            api.upload_file(path_or_fileobj=local_path, path_in_repo=remote_filename, repo_id=DATASET_REPO_ID, repo_type="dataset")
+        except: pass
     threading.Thread(target=_push).start()
 def init_files():
     sync_pull()
     for f in [LABEL_FILE, VERIFY_FILE, SKIP_FILE]:
         if not os.path.exists(f):
+            cols = ["timestamp", "user", "group_id", "url", "score", "label"] if f == LABEL_FILE else \
+                   ["timestamp", "user", "group_id", "url", "is_correct", "corrected_label", "corrected_score"] if f == VERIFY_FILE else \
+                   ["timestamp", "user", "group_id"]
             pd.DataFrame(columns=cols).to_csv(f, index=False)
 init_files()
         with open(URL_FILE, 'r') as f:
             data = json.load(f)
             if "groups" in data:
+                for group in data["groups"]: urls.extend(group.get("images", []))
+    except: pass
     return urls
 def get_ordered_groups():
     if not os.path.exists(LABEL_FILE): return []
     try:
         df = pd.read_csv(LABEL_FILE)
+        if df.empty: return []
+        df = df.drop_duplicates(subset=['url'], keep='last')
         errors = df[(df['score'] == 10) & (df['label'] != 'living_room')]
         return errors['group_id'].unique().tolist()
     except: return []
     try:
         fname = LABEL_FILE if mode in ["label", "fix"] else VERIFY_FILE
         df = pd.read_csv(fname)
+        df = df.drop_duplicates(subset=['url'], keep='last')
         rows = df[df['group_id'] == gid]
         for _, row in rows.iterrows():
             if mode in ["label", "fix"]:
                 saved_data[row['url']] = {"score": row['score'], "label": row['label']}
             else:
+                saved_data[row['url']] = {"is_correct": row['is_correct'], "label": row['corrected_label'], "score": row['corrected_score']}
     except: pass
     return saved_data
 def get_stats_text():
     all_gids = get_ordered_groups()
     flagged = get_flagged_groups()
+    try:
+        df_l = pd.read_csv(LABEL_FILE).drop_duplicates(subset=['url'], keep='last')
+        l = len(df_l['group_id'].unique())
     except: l = 0
+    err_msg = f" | ⚠️ **Fix Needed:** {len(flagged)}" if flagged else ""
+    return f"**Properties:** {len(all_gids)} | **Labeled:** {l}{err_msg}"
 def render_workspace(mode, history, specific_index=None, move_back=False):
     all_ordered = get_ordered_groups()
     flagged_pool = get_flagged_groups()
     target_gid = None
     if specific_index is not None:
         if 0 <= specific_index < len(all_ordered): target_gid = all_ordered[specific_index]
     elif move_back and len(history) > 1:
         history.pop()
         target_gid = history[-1]
     else:
+        try:
+            df_l = pd.read_csv(LABEL_FILE).drop_duplicates(subset=['url'], keep='last')
+            l_done = set(df_l['group_id'].unique())
+            df_v = pd.read_csv(VERIFY_FILE).drop_duplicates(subset=['url'], keep='last')
+            v_done = set(df_v['group_id'].unique())
         except: l_done, v_done = set(), set()
+        if mode == "fix":
+            candidates = flagged_pool
+        else:
+            candidates = [g for g in all_ordered if (mode=="label" and g not in l_done) or (mode=="verify" and g in l_done and g not in v_done)]
         if not candidates:
+             return {screen_menu: gr.update(visible=True), screen_work: gr.update(visible=False), log_box: "Done!"}
         target_gid = candidates[0]
     urls = get_group_urls(target_gid)
         processed_images = list(executor.map(fetch, urls))
     updates = {
+        screen_menu: gr.update(visible=False), screen_work: gr.update(visible=True),
         header_md: f"# {mode.upper()} Property #{target_idx + 1} ({target_gid})",
         state_urls: urls, state_hist: history, state_idx: target_idx,
         top_stats: get_stats_text(), log_box: f"Loaded {target_gid}"
             updates[img_objs[i]] = gr.update(value=processed_images[i], visible=True)
             v_sc = saved_vals.get(u, {}).get('score', 5)
             v_lbl = saved_vals.get(u, {}).get('label', "living_room")
             is_err = (v_sc == 10 and v_lbl != "living_room")
             if mode in ["label", "fix"]:
                 updates[c_sld] = gr.update(visible=True, value=v_sc, interactive=True)
                 updates[c_drp] = gr.update(visible=True, value=v_lbl, interactive=True)
                 updates[c_chk] = gr.update(visible=False)
+                updates[c_lbl] = gr.update(visible=True if is_err else False, value="<span style='color:red'>⚠️ Score 10=Living Room only</span>")
             else:
                 p_lbl, p_sc = r1_vals.get(u, {}).get('label', "?"), r1_vals.get(u, {}).get('score', "?")
                 updates[c_sld] = gr.update(visible=True, value=v_sc if u in saved_vals else p_sc)
                 updates[c_drp] = gr.update(visible=True, value=v_lbl)
+                updates[c_chk] = gr.update(visible=True, value=saved_vals.get(u, {}).get('is_correct', True))
+                updates[c_lbl] = gr.update(visible=True, value=f"Prev: {p_lbl} ({p_sc})")
         else:
             updates[img_objs[i]] = gr.update(visible=False)
             for obj in [c_sld, c_drp, c_chk, c_lbl]: updates[obj] = gr.update(visible=False)
         sc, lbl, chk = args[i*4], args[i*4+1], args[i*4+2]
         if mode in ["label", "fix"]: rows.append([ts, "user", gid, u, sc, lbl])
         else: rows.append([ts, "user", gid, u, chk, lbl, sc])
     fname = LABEL_FILE if mode in ["label", "fix"] else VERIFY_FILE
     with FileLock(LOCK_FILE):
         with open(fname, "a", newline="") as f: csv.writer(f).writerows(rows)
     all_gids = get_ordered_groups()
     flagged = set(get_flagged_groups())
     try:
+        df_l = pd.read_csv(LABEL_FILE).drop_duplicates(subset=['url'], keep='last')
+        l_set = set(df_l['group_id'].unique())
+        df_v = pd.read_csv(VERIFY_FILE).drop_duplicates(subset=['url'], keep='last')
+        v_set = set(df_v['group_id'].unique())
     except: l_set, v_set = set(), set()
     data = []
     for i, gid in enumerate(all_gids):
         data.append([i+1, s, gid])
     return pd.DataFrame(data, columns=["#", "Status", "ID"])
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
     state_mode, state_hist, state_urls, state_idx = gr.State("label"), gr.State([]), gr.State([]), gr.State(0)
     with gr.Row():
         top_stats = gr.Markdown("Loading...")
         btn_home = gr.Button("🏠 Home", size="sm", scale=0)
     with gr.Tabs():
         with gr.Tab("Workspace"):
             with gr.Group() as screen_menu:
                 gr.Markdown("# Welcome")
                 with gr.Row():
+                    b_start_l, b_start_v, b_start_f = gr.Button("Label", variant="primary"), gr.Button("Verify"), gr.Button("🛠 Fix", variant="secondary")
             with gr.Group(visible=False) as screen_work:
                 header_md = gr.Markdown()
                 img_objs, input_objs = [], []
                     for i in range(MAX_IMAGES):
                         with gr.Column(min_width=200):
                             img = gr.Image(interactive=False, height=240)
+                            sld, drp, chk, lbl = gr.Slider(1, 10, step=1, label="Score"), gr.Dropdown(ROOM_CLASSES, label="Class"), gr.Checkbox(label="Correct?"), gr.Markdown()
                             img_objs.append(img); input_objs.extend([sld, drp, chk, lbl])
                 with gr.Row():
+                    b_back, b_save = gr.Button("⬅ Back"), gr.Button("💾 Save & Next", variant="primary")
                 log_box = gr.Textbox(label="Log", interactive=False)
         with gr.Tab("Catalog"):
             with gr.Row():
                 num_in = gr.Number(value=1, label="Prop #", precision=0)
+                b_go_l, b_go_v, b_go_f = gr.Button("Go Label"), gr.Button("Go Verify"), gr.Button("Go Fix")
             df_cat = gr.Dataframe(interactive=False)
             b_ref_cat = gr.Button("Refresh")
     b_save.click(save_data, [state_mode, state_hist, state_urls] + input_objs, ALL_IO)
     b_back.click(lambda m, h: render_workspace(m, h, move_back=True), [state_mode, state_hist], ALL_IO)
     btn_home.click(lambda: {screen_menu: gr.update(visible=True), screen_work: gr.update(visible=False), state_hist: []}, None, [screen_menu, screen_work, state_hist])
+    b_go_l.click(lambda: "label", None, state_mode).then(lambda n,m,h: render_workspace(m,h,int(n)-1), [num_in, state_mode, state_hist], ALL_IO)
+    b_go_v.click(lambda: "verify", None, state_mode).then(lambda n,m,h: render_workspace(m,h,int(n)-1), [num_in, state_mode, state_hist], ALL_IO)
+    b_go_f.click(lambda: "fix", None, state_mode).then(lambda n,m,h: render_workspace(m,h,int(n)-1), [num_in, state_mode, state_hist], ALL_IO)
     b_ref_cat.click(refresh_cat, None, df_cat)
     demo.load(refresh_cat, None, df_cat).then(get_stats_text, None, top_stats)