DouDou committed on
Commit
b805898
·
verified ·
1 Parent(s): 5c31870

Upload data3/check_enhanced.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data3/check_enhanced.py +35 -0
data3/check_enhanced.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
"""Sanity-check enhanced_dataset.csv.

Prints the column names and a few fields of the first data row, then scans
the file for the row whose index column equals TARGET_INDEX and reports
where (or whether) it was found.
"""
import csv
import itertools
import json

DATASET_PATH = 'enhanced_dataset.csv'
TARGET_INDEX = '489788'
# Upper bound on the number of data rows scanned, so the search never
# walks an arbitrarily large file.
MAX_ROWS = 100000
# Candidate index columns, in priority order.
# NOTE(review): these look like the names pandas gives to unnamed index
# columns on a CSV round-trip -- confirm against how the file was written.
INDEX_COLUMNS = ('Unnamed: 0.1', 'Unnamed: 0', '')
FIRST_ROW_FIELDS = ('Unnamed: 0', 'Unnamed: 0.1', 'repo_name', 'path', 'language')


def row_index_value(row):
    """Return the first non-empty value among INDEX_COLUMNS in *row*.

    Returns None when every candidate column is missing or empty.
    """
    for column in INDEX_COLUMNS:
        value = row.get(column)
        if value:
            return value
    return None


def find_row_by_index(rows, target, max_rows=MAX_ROWS):
    """Search *rows* for the first row whose index value equals *target*.

    Args:
        rows: iterable of dict-like rows (e.g. from ``csv.DictReader``).
        target: index value to look for, as a string.
        max_rows: maximum number of rows to examine.

    Returns:
        ``(position, row)`` with a 1-based ``position`` when a match is found,
        otherwise ``(scanned, None)`` where ``scanned`` is the number of rows
        that were examined.
    """
    scanned = 0
    for scanned, row in enumerate(itertools.islice(rows, max_rows), start=1):
        if row_index_value(row) == target:
            return scanned, row
    return scanned, None


def main():
    """Run the check against DATASET_PATH and print the findings."""
    print(f"Checking {DATASET_PATH}...")
    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(DATASET_PATH, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)

        # Get first row; an empty or header-only file has none.
        first_row = next(reader, None)
        if first_row is None:
            print("No data rows found")
            return

        print(f"Columns: {list(first_row.keys())}")
        print("\nFirst row values:")
        for field in FIRST_ROW_FIELDS:
            print(f" {field}: {first_row.get(field, 'N/A')}")

        # Try to find the row matching original_index=TARGET_INDEX.
        print(f"\n\nSearching for original_index={TARGET_INDEX}...")
        # Put the already-consumed first row back in front of the reader
        # instead of seeking the file underneath a live csv reader.
        all_rows = itertools.chain([first_row], reader)
        position, match = find_row_by_index(all_rows, TARGET_INDEX, MAX_ROWS)

    if match is not None:
        print(f"Found at row {position}!")
        print(f" repo_name: '{match.get('repo_name', 'N/A')}'")
        print(f" path: '{match.get('path', 'N/A')}'")
        print(f" language: '{match.get('language', 'N/A')}'")
    elif position >= MAX_ROWS:
        # Don't search forever: the scan stopped at the row cap.
        print(f"Not found in first {MAX_ROWS // 1000}k rows")
    else:
        # The file ended before the cap; say so rather than exiting silently.
        print(f"Not found: reached end of file after {position} rows")


if __name__ == "__main__":
    main()