DouDou committed on
Upload data1/merge_dataset.py with huggingface_hub
Browse files- data1/merge_dataset.py +97 -0
data1/merge_dataset.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
import hashlib
from pathlib import Path

import pandas as pd
from tqdm import tqdm

# Merge locally crawled repository files with the pre-built chempile CSV
# shards into a single deduplicated dataset, then write it out both as one
# full CSV and as adaptively sized ~500MB CSV chunks.

# Repo metadata: maps "owner/repo" full_name -> {"keyword": ..., "license": ...}.
repos_df = pd.read_csv("workdir/repos_checked.csv")
repo_meta = repos_df.set_index("full_name")[["keyword", "license"]].to_dict("index")

# Walk every file of every filtered repo and collect one row per file.
print("Processing crawled repos...")
crawl_rows = []
filtered_dir = Path("workdir/repos_filtered")

for repo_dir in tqdm(list(filtered_dir.iterdir()), desc="Reading filtered repos"):
    if not repo_dir.is_dir() or repo_dir.name.startswith("."):
        continue

    # Directory names encode "owner/repo" as "owner___repo".
    full_name = repo_dir.name.replace("___", "/", 1)
    meta = repo_meta.get(full_name, {"keyword": "", "license": ""})

    for file_path in repo_dir.rglob("*"):
        if not file_path.is_file():
            continue

        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()

            crawl_rows.append(
                {
                    "text": text,
                    "repo_name": full_name,
                    "path": str(file_path.relative_to(repo_dir)),
                    # Best-effort language guess from the file extension only.
                    "language": file_path.suffix.lstrip(".") or "unknown",
                    "license": meta["license"],
                    "size": len(text),
                    "keyword": meta["keyword"],
                    # Placeholder only: the unified SHA-1 text_hash is computed
                    # once for ALL rows after the merge below. (The per-file
                    # SHA-256 previously computed here was always overwritten,
                    # so it was pure wasted work.)
                    "text_hash": "",
                    "config": "",
                    "split": "",
                    "repo_path": "",
                    "ds_source": "crawl",
                }
            )
        except Exception as e:
            # Best-effort crawl: report unreadable files and keep going.
            print(f"Error reading {file_path}: {e}")

crawl_df = pd.DataFrame(crawl_rows)

# Load the pre-built chempile shards and tag their origin.
print("\nLoading chempile data...")
chempile_files = sorted(Path("./datasets/all_chempile_code").glob("chempile_code_complete_*.csv"))
chempile_df = pd.concat([pd.read_csv(f) for f in tqdm(chempile_files)], ignore_index=True)
chempile_df["ds_source"] = "chempile"

# Merge both sources. chempile rows come first, so keep="first" in the
# dedup step below prefers them when the same text appears in both sources.
print("\nMerging datasets...")
merged_df = pd.concat([chempile_df, crawl_df], ignore_index=True)
original_count = len(merged_df)

# One unified SHA-1 hash over the text column so rows from both sources
# deduplicate against each other with the same key.
print("Computing unified text_hash for all rows...")
merged_df["text_hash"] = merged_df["text"].apply(lambda x: hashlib.sha1(str(x).encode()).hexdigest())

print("Deduplicating by text_hash...")
merged_df = merged_df.drop_duplicates(subset=["text_hash"], keep="first")

# Save the full dataset plus ~500MB chunks. The chunk row count is
# re-estimated after each write from the observed bytes-per-row, with a
# 5% safety margin so chunks land just under the limit.
print("\nSaving in 500MB chunks...")
merged_data_dir = "./datasets/data_merged"
os.makedirs(merged_data_dir, exist_ok=True)
# index=False keeps the full dump's columns consistent with the chunk files.
merged_df.to_csv(f"{merged_data_dir}/dataset_all.csv", index=False)
MAX_SIZE_MB = 500
chunk_num = 1
rows_per_chunk = 50000
start_idx = 0

while start_idx < len(merged_df):
    end_idx = min(start_idx + rows_per_chunk, len(merged_df))
    chunk_df = merged_df.iloc[start_idx:end_idx]

    output_path = f"{merged_data_dir}/{chunk_num:03d}.csv"
    chunk_df.to_csv(output_path, index=False)
    size_mb = os.path.getsize(output_path) / (1024 * 1024)

    if size_mb > 0:
        # Re-estimate rows per chunk. Clamp to at least 1 row: with very
        # large rows the old int(...) could reach 0, leaving start_idx stuck
        # and the loop spinning forever.
        rows_per_chunk = max(1, int(rows_per_chunk * (MAX_SIZE_MB / size_mb) * 0.95))

    print(f"Saved {output_path}: {size_mb:.1f}MB, {len(chunk_df):,} rows")
    start_idx = end_idx
    chunk_num += 1

print(f"\nTotal: {len(merged_df):,} rows ({len(crawl_df):,} crawl + {len(chempile_df):,} chempile)")
# original_count == len(chempile_df) + len(crawl_df); reuse it instead of
# recomputing the pre-dedup total.
print(f"Deduplicated: {original_count - len(merged_df):,} rows removed")