DouDou committed on
Commit
a51dbcc
·
verified ·
1 Parent(s): 15d17be

Upload data2/step22/function_req.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data2/step22/function_req.py +80 -0
data2/step22/function_req.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import jsonlines
3
+ import json
4
+
5
def get_function_scores(dir):
    """Collect the combined score of every scored function record under ``dir``.

    Each immediate subdirectory of ``dir`` is visited in sorted name order.
    The ``score`` value stored in its ``readme_summary.json`` is multiplied
    with the ``score`` of every record in its ``functions.jsonl``; records
    without a ``score`` key are skipped.

    Args:
        dir: Path of the directory whose subdirectories are scanned.

    Returns:
        A flat list with one product per scored record, ordered by
        subdirectory and then by position in ``functions.jsonl``.
    """
    all_scores = []
    entries = [
        entry for entry in os.listdir(dir)
        if os.path.isdir(os.path.join(dir, entry))
    ]
    entries.sort()

    for entry in entries:
        subdir_path = os.path.join(dir, entry)

        # The README-level score acts as a multiplier for every function
        # record found in this subdirectory.
        summary_path = os.path.join(subdir_path, 'readme_summary.json')
        with open(summary_path, 'r', encoding='utf-8', errors='ignore') as summary_file:
            readme_score = json.load(summary_file)['score']

        records_path = os.path.join(subdir_path, 'functions.jsonl')
        with jsonlines.open(records_path) as records:
            all_scores.extend(
                record['score'] * readme_score
                for record in records
                if 'score' in record
            )

    return all_scores
22
+
23
+ # scores = sorted(get_function_scores('/home/weifengsun/tangou1/step2/step22/dataset'), reverse=True)
24
+ # print(len(scores))
25
+ # print(scores[:10])
26
+ # print(scores[-10:])
27
+ # print(scores[100000])
28
+ # print(scores[200000])
29
+ # print(scores[300000])
30
+ # print(scores[400000])
31
+ # print(scores[500000])
32
+ # 18099531
33
+ # [0.28443953109169584, 0.2844296876675756, 0.2825556445220201, 0.2806598113798131, 0.2768595256346984, 0.2727182712631837, 0.2727182712631837, 0.2727182712631837, 0.2727182712631837, 0.2727182712631837]
34
+ # [-0.024879203269990824, -0.02555222846015559, -0.02579063325241293, -0.02583700829326574, -0.02583700829326574, -0.02587076373841679, -0.025900709478987816, -0.025951737689723586, -0.029216928089614402, -0.04466233910208084]
35
+ # 0.15702062139215656
36
+ # 0.14540986706855819
37
+ # 0.13808880121203515
38
+ # 0.13262306995012807
39
+ # 0.1282891692796717
40
+
41
+
42
def output_scores(dir, output_path, score):
    """Append every function whose combined score exceeds ``score`` to a JSONL file.

    Each immediate subdirectory of ``dir`` is visited in sorted name order.
    From its ``readme_summary.json`` the ``readme_summary`` and ``score``
    values are read. A record of its ``functions.jsonl`` is kept when it has
    a ``score`` key and ``record['score'] * readme score`` is strictly
    greater than ``score``.

    Kept records are extended with:
      * ``md_summary``   - the README summary text,
      * ``md_score``     - the README score,
      * ``final_score``  - the product of the two scores,
      * ``code_content`` - lines ``start_line``..``end_line`` (1-based,
        inclusive) of the file named by the record's ``file`` key.

    Args:
        dir: Path of the directory whose subdirectories are scanned.
        output_path: JSONL file that kept records are appended to. It is
            opened in append mode once per subdirectory, so existing
            content is preserved.
        score: Exclusive lower bound for the combined score.

    Returns:
        None.
    """
    subdirs = sorted(d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)))
    for subdir in subdirs:
        md_path = os.path.join(dir, subdir, 'readme_summary.json')
        with open(md_path, 'r', encoding='utf-8', errors='ignore') as f:
            data = json.load(f)
        md_summary = data['readme_summary']
        md_score = data['score']

        json_path = os.path.join(dir, subdir, 'functions.jsonl')
        contents = []
        # (path, lines) of the most recently read source file. Reusing it
        # avoids re-reading a whole file for every function it contains
        # whenever consecutive records reference the same file, while keeping
        # at most one file's lines in memory.
        last_source = None
        with jsonlines.open(json_path) as reader:
            for obj in reader:
                if 'score' not in obj:
                    continue
                final_score = obj['score'] * md_score
                if final_score > score:
                    obj['md_summary'] = md_summary
                    obj['md_score'] = md_score
                    obj['final_score'] = final_score

                    source_path = obj['file']
                    if last_source is None or last_source[0] != source_path:
                        with open(source_path, 'r', encoding='utf-8', errors='ignore') as src:
                            last_source = (source_path, src.readlines())
                    source_lines = last_source[1]

                    # start_line/end_line are used as 1-based inclusive bounds.
                    obj['code_content'] = ''.join(
                        source_lines[obj['start_line'] - 1:obj['end_line']]
                    )
                    contents.append(obj)

        # Appending after each subdirectory keeps already-processed results
        # on disk even if a later subdirectory fails.
        with jsonlines.open(output_path, 'a', flush=True) as writer:
            writer.write_all(contents)
66
+
67
# Script entry: filter the dataset and append the surviving functions to the
# output JSONL file.
# NOTE(review): the threshold appears to be the value recorded in the
# commented-out exploration above for scores[500000] of the descending-sorted
# score list — confirm before changing it.
output_scores(
    dir='/home/weifengsun/tangou1/step2/step22/dataset',
    output_path='/home/weifengsun/tangou1/step2/step22/output/function_filtered_scores.jsonl',
    score=0.1282891692796717,
)
68
+ # with jsonlines.open('/home/weifengsun/tangou1/step2/step22/function_filtered_scores.jsonl', 'r') as reader:
69
+ # print(len(list(reader)))
70
+ # # 500000
71
+
72
+
73
+ # path = "/home/weifengsun/tangou1/step2/step22/function_filtered_scores.jsonl"
74
+ # size_bytes = os.path.getsize(path)
75
+
76
+ # size_mb = size_bytes / (1024 * 1024)
77
+ # size_gb = size_bytes / (1024 * 1024 * 1024)
78
+
79
+ # print(f"文件大小: {size_mb:.2f} MB")
80
+ # print(f"文件大小: {size_gb:.2f} GB")