DouDou committed on
Upload data1/merge_dataset.py with huggingface_hub
Browse files- data1/merge_dataset.py +97 -0
data1/merge_dataset.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
import hashlib
from pathlib import Path

import pandas as pd
from tqdm import tqdm

# Merge locally crawled repository files with the pre-built chempile CSV
# shards into a single deduplicated dataset, then write it out both as one
# full CSV and as adaptively sized ~500MB CSV chunks.

# Repo metadata: maps "owner/repo" full_name -> {"keyword": ..., "license": ...}.
repos_df = pd.read_csv("workdir/repos_checked.csv")
repo_meta = repos_df.set_index("full_name")[["keyword", "license"]].to_dict("index")

# Walk every file of every filtered repo and collect one row per file.
print("Processing crawled repos...")
crawl_rows = []
filtered_dir = Path("workdir/repos_filtered")

for repo_dir in tqdm(list(filtered_dir.iterdir()), desc="Reading filtered repos"):
    if not repo_dir.is_dir() or repo_dir.name.startswith("."):
        continue

    # Directory names encode "owner/repo" as "owner___repo".
    full_name = repo_dir.name.replace("___", "/", 1)
    meta = repo_meta.get(full_name, {"keyword": "", "license": ""})

    for file_path in repo_dir.rglob("*"):
        if not file_path.is_file():
            continue

        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()

            crawl_rows.append(
                {
                    "text": text,
                    "repo_name": full_name,
                    "path": str(file_path.relative_to(repo_dir)),
                    # Best-effort language guess from the file extension only.
                    "language": file_path.suffix.lstrip(".") or "unknown",
                    "license": meta["license"],
                    "size": len(text),
                    "keyword": meta["keyword"],
                    # Placeholder only: the unified SHA-1 text_hash is computed
                    # once for ALL rows after the merge below. (The per-file
                    # SHA-256 previously computed here was always overwritten,
                    # so it was pure wasted work.)
                    "text_hash": "",
                    "config": "",
                    "split": "",
                    "repo_path": "",
                    "ds_source": "crawl",
                }
            )
        except Exception as e:
            # Best-effort crawl: report unreadable files and keep going.
            print(f"Error reading {file_path}: {e}")

crawl_df = pd.DataFrame(crawl_rows)

# Load the pre-built chempile shards and tag their origin.
print("\nLoading chempile data...")
chempile_files = sorted(Path("./datasets/all_chempile_code").glob("chempile_code_complete_*.csv"))
chempile_df = pd.concat([pd.read_csv(f) for f in tqdm(chempile_files)], ignore_index=True)
chempile_df["ds_source"] = "chempile"

# Merge both sources. chempile rows come first, so keep="first" in the
# dedup step below prefers them when the same text appears in both sources.
print("\nMerging datasets...")
merged_df = pd.concat([chempile_df, crawl_df], ignore_index=True)
original_count = len(merged_df)

# One unified SHA-1 hash over the text column so rows from both sources
# deduplicate against each other with the same key.
print("Computing unified text_hash for all rows...")
merged_df["text_hash"] = merged_df["text"].apply(lambda x: hashlib.sha1(str(x).encode()).hexdigest())

print("Deduplicating by text_hash...")
merged_df = merged_df.drop_duplicates(subset=["text_hash"], keep="first")

# Save the full dataset plus ~500MB chunks. The chunk row count is
# re-estimated after each write from the observed bytes-per-row, with a
# 5% safety margin so chunks land just under the limit.
print("\nSaving in 500MB chunks...")
merged_data_dir = "./datasets/data_merged"
os.makedirs(merged_data_dir, exist_ok=True)
# index=False keeps the full dump's columns consistent with the chunk files.
merged_df.to_csv(f"{merged_data_dir}/dataset_all.csv", index=False)
MAX_SIZE_MB = 500
chunk_num = 1
rows_per_chunk = 50000
start_idx = 0

while start_idx < len(merged_df):
    end_idx = min(start_idx + rows_per_chunk, len(merged_df))
    chunk_df = merged_df.iloc[start_idx:end_idx]

    output_path = f"{merged_data_dir}/{chunk_num:03d}.csv"
    chunk_df.to_csv(output_path, index=False)
    size_mb = os.path.getsize(output_path) / (1024 * 1024)

    if size_mb > 0:
        # Re-estimate rows per chunk. Clamp to at least 1 row: with very
        # large rows the old int(...) could reach 0, leaving start_idx stuck
        # and the loop spinning forever.
        rows_per_chunk = max(1, int(rows_per_chunk * (MAX_SIZE_MB / size_mb) * 0.95))

    print(f"Saved {output_path}: {size_mb:.1f}MB, {len(chunk_df):,} rows")
    start_idx = end_idx
    chunk_num += 1

print(f"\nTotal: {len(merged_df):,} rows ({len(crawl_df):,} crawl + {len(chempile_df):,} chempile)")
# original_count == len(chempile_df) + len(crawl_df); reuse it instead of
# recomputing the pre-dedup total.
print(f"Deduplicated: {original_count - len(merged_df):,} rows removed")