DouDou committed on
Commit
1ad6225
·
verified ·
1 Parent(s): fe52d16

Upload data1/merge_dataset.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data1/merge_dataset.py +97 -0
data1/merge_dataset.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# --- Imports: stdlib first, then third-party -------------------------------
import hashlib
import os
from pathlib import Path

import pandas as pd
from tqdm import tqdm

# Read repos metadata
# Per-repo metadata (keyword, license) indexed by "owner/name", as a plain
# dict-of-dicts for cheap lookups inside the crawl loop below.
repos_df = pd.read_csv("workdir/repos_checked.csv")
repo_meta = repos_df.set_index("full_name")[["keyword", "license"]].to_dict("index")
# Process crawled repos
# Turn every file of every filtered repo checkout into one dataset row.
print("Processing crawled repos...")
crawl_rows = []
filtered_dir = Path("workdir/repos_filtered")

for repo_dir in tqdm(list(filtered_dir.iterdir()), desc="Reading filtered repos"):
    # Skip stray files and hidden entries (e.g. ".git", ".DS_Store").
    if not repo_dir.is_dir() or repo_dir.name.startswith("."):
        continue

    # Checkout dirs encode "owner/repo" as "owner___repo"; undo the first
    # separator only, so repo names containing "___" survive intact.
    full_name = repo_dir.name.replace("___", "/", 1)
    meta = repo_meta.get(full_name, {"keyword": "", "license": ""})

    for file_path in repo_dir.rglob("*"):
        if not file_path.is_file():
            continue

        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()

            row = {
                "text": text,
                "repo_name": full_name,
                "path": str(file_path.relative_to(repo_dir)),
                # Crude language tag: the file extension without its dot.
                "language": file_path.suffix.lstrip(".") or "unknown",
                "license": meta["license"],
                "size": len(text),
                "keyword": meta["keyword"],
                # NOTE: this sha256 is replaced later by a unified sha1 pass
                # over the merged frame; kept here so crawl_df is usable alone.
                "text_hash": hashlib.sha256(text.encode()).hexdigest(),
                "config": "",
                "split": "",
                "repo_path": "",
                "ds_source": "crawl",
            }
            crawl_rows.append(row)
        except Exception as e:
            # Unreadable file: report and keep going (best-effort crawl).
            print(f"Error reading {file_path}: {e}")

crawl_df = pd.DataFrame(crawl_rows)
# Load chempile data
# Concatenate all pre-built ChemPile code shards and tag their provenance.
print("\nLoading chempile data...")
chempile_files = sorted(Path("./datasets/all_chempile_code").glob("chempile_code_complete_*.csv"))
if not chempile_files:
    # pd.concat([]) would raise an opaque "No objects to concatenate"
    # ValueError; fail early with an actionable message instead.
    raise FileNotFoundError(
        "No chempile_code_complete_*.csv files found in ./datasets/all_chempile_code"
    )
chempile_df = pd.concat([pd.read_csv(f) for f in tqdm(chempile_files)], ignore_index=True)
chempile_df["ds_source"] = "chempile"
# Merge and compute unified text_hash for all rows
print("\nMerging datasets...")
merged_df = pd.concat([chempile_df, crawl_df], ignore_index=True)

# Compute text_hash for all rows (unified hash).
# This deliberately overwrites the crawl rows' per-file sha256 hashes with
# sha1 so that BOTH sources deduplicate under one consistent scheme.
# str(x) guards against non-string cells (e.g. NaN from the concat).
print("Computing unified text_hash for all rows...")
merged_df["text_hash"] = merged_df["text"].apply(lambda x: hashlib.sha1(str(x).encode()).hexdigest())

# Deduplicate by text_hash, keeping the first occurrence (chempile rows come
# first in the concat, so they win ties against crawl rows).
print("Deduplicating by text_hash...")
merged_df = merged_df.drop_duplicates(subset=["text_hash"], keep="first")
# Save in 500MB chunks
print("\nSaving in 500MB chunks...")
merged_data_dir = "./datasets/data_merged"
os.makedirs(merged_data_dir, exist_ok=True)
# index=False keeps the full dump's column layout identical to the numbered
# chunks below (the original wrote the index here but not in the chunks).
merged_df.to_csv(f"{merged_data_dir}/dataset_all.csv", index=False)

MAX_SIZE_MB = 500
chunk_num = 1
rows_per_chunk = 50000  # initial guess; adapted from each chunk's actual size
start_idx = 0

while start_idx < len(merged_df):
    end_idx = min(start_idx + rows_per_chunk, len(merged_df))
    chunk_df = merged_df.iloc[start_idx:end_idx]

    output_path = f"{merged_data_dir}/{chunk_num:03d}.csv"
    chunk_df.to_csv(output_path, index=False)
    size_mb = os.path.getsize(output_path) / (1024 * 1024)

    # Re-estimate rows per chunk so the next file lands just under
    # MAX_SIZE_MB (0.95 safety margin). Clamp to >= 1 row: without the
    # clamp, one very large row can drive the estimate to 0, after which
    # end_idx == start_idx and the loop never advances.
    if size_mb > 0:
        rows_per_chunk = max(1, int(rows_per_chunk * (MAX_SIZE_MB / size_mb) * 0.95))

    print(f"Saved {output_path}: {size_mb:.1f}MB, {len(chunk_df):,} rows")
    start_idx = end_idx
    chunk_num += 1
# Final accounting: rows retained, and how many duplicates were dropped
# relative to the two source frames combined.
removed = len(chempile_df) + len(crawl_df) - len(merged_df)
print(f"\nTotal: {len(merged_df):,} rows ({len(crawl_df):,} crawl + {len(chempile_df):,} chempile)")
print(f"Deduplicated: {removed:,} rows removed")