DouDou committed on
Upload data2/step22/function_req.py with huggingface_hub
Browse files- data2/step22/function_req.py +80 -0
data2/step22/function_req.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import jsonlines
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
def get_function_scores(dir):
    """Collect the combined score of every scored function under ``dir``.

    Each immediate subdirectory of ``dir`` is read for two files:
    ``readme_summary.json`` (a JSON object with a ``score`` entry) and
    ``functions.jsonl`` (one JSON object per line). For every record of
    ``functions.jsonl`` that has a ``score`` key, the product
    ``record['score'] * readme_score`` is added to the result; records
    without a ``score`` key are skipped.

    Args:
        dir: Path of the directory whose subdirectories are scanned. The
            name shadows the builtin ``dir``; it is kept so existing
            keyword callers continue to work.

    Returns:
        A flat list of combined scores, ordered by sorted subdirectory name
        and, within a subdirectory, by record order in ``functions.jsonl``.

    Raises:
        OSError: If ``dir`` cannot be listed or ``readme_summary.json``
            cannot be opened.
        KeyError: If ``readme_summary.json`` has no ``score`` entry.
    """
    scores = []
    subdirs = sorted(
        name for name in os.listdir(dir)
        if os.path.isdir(os.path.join(dir, name))
    )
    for subdir in subdirs:
        md_path = os.path.join(dir, subdir, 'readme_summary.json')
        with open(md_path, 'r', encoding='utf-8', errors='ignore') as f:
            md_score = json.load(f)['score']

        json_path = os.path.join(dir, subdir, 'functions.jsonl')
        with jsonlines.open(json_path) as reader:
            # Stream the products straight into the result instead of
            # building a throwaway per-subdirectory list first.
            scores.extend(
                obj['score'] * md_score for obj in reader if 'score' in obj
            )
    return scores
|
| 22 |
+
|
| 23 |
+
# scores = sorted(get_function_scores('/home/weifengsun/tangou1/step2/step22/dataset'), reverse=True)
|
| 24 |
+
# print(len(scores))
|
| 25 |
+
# print(scores[:10])
|
| 26 |
+
# print(scores[-10:])
|
| 27 |
+
# print(scores[100000])
|
| 28 |
+
# print(scores[200000])
|
| 29 |
+
# print(scores[300000])
|
| 30 |
+
# print(scores[400000])
|
| 31 |
+
# print(scores[500000])
|
| 32 |
+
# 18099531
|
| 33 |
+
# [0.28443953109169584, 0.2844296876675756, 0.2825556445220201, 0.2806598113798131, 0.2768595256346984, 0.2727182712631837, 0.2727182712631837, 0.2727182712631837, 0.2727182712631837, 0.2727182712631837]
|
| 34 |
+
# [-0.024879203269990824, -0.02555222846015559, -0.02579063325241293, -0.02583700829326574, -0.02583700829326574, -0.02587076373841679, -0.025900709478987816, -0.025951737689723586, -0.029216928089614402, -0.04466233910208084]
|
| 35 |
+
# 0.15702062139215656
|
| 36 |
+
# 0.14540986706855819
|
| 37 |
+
# 0.13808880121203515
|
| 38 |
+
# 0.13262306995012807
|
| 39 |
+
# 0.1282891692796717
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def output_scores(dir, output_path, score):
    """Append every function whose combined score exceeds ``score`` to a JSONL file.

    Each immediate subdirectory of ``dir`` is read for ``readme_summary.json``
    (a JSON object with ``readme_summary`` and ``score`` entries) and
    ``functions.jsonl`` (one JSON object per line). A record is selected when
    it has a ``score`` key and ``record['score'] * readme_score > score``.
    Selected records are extended with:

    * ``md_summary``   - the ``readme_summary`` value of the subdirectory,
    * ``md_score``     - the ``score`` value of the subdirectory,
    * ``final_score``  - ``record['score'] * md_score``,
    * ``code_content`` - lines ``start_line`` .. ``end_line`` (1-based,
      inclusive) of the file named by ``record['file']``.

    Args:
        dir: Path of the directory whose subdirectories are scanned. The
            name shadows the builtin ``dir``; it is kept so existing
            keyword callers continue to work.
        output_path: JSONL file to write to. It is opened in append mode
            once per subdirectory, so re-running the function adds to any
            existing content rather than replacing it.
        score: Exclusive lower bound for the combined score.

    Returns:
        None.

    Raises:
        OSError: If ``dir`` cannot be listed, or ``readme_summary.json`` or a
            source file named by a selected record cannot be opened.
        KeyError: If a required entry (``readme_summary``/``score`` in the
            summary; ``file``/``start_line``/``end_line`` in a selected
            record) is missing.
    """
    subdirs = sorted(
        name for name in os.listdir(dir)
        if os.path.isdir(os.path.join(dir, name))
    )
    for subdir in subdirs:
        md_path = os.path.join(dir, subdir, 'readme_summary.json')
        with open(md_path, 'r', encoding='utf-8', errors='ignore') as f:
            data = json.load(f)
        md_summary = data['readme_summary']
        md_score = data['score']

        json_path = os.path.join(dir, subdir, 'functions.jsonl')
        contents = []
        # Cache of already-split source files for this subdirectory, so a
        # file holding several selected functions is read only once.
        lines_by_file = {}
        with jsonlines.open(json_path) as reader:
            for obj in reader:
                if 'score' not in obj:
                    continue
                final_score = obj['score'] * md_score
                if not (final_score > score):
                    continue

                obj['md_summary'] = md_summary
                obj['md_score'] = md_score
                obj['final_score'] = final_score

                source_path = obj['file']
                lines = lines_by_file.get(source_path)
                if lines is None:
                    with open(source_path, 'r', encoding='utf-8',
                              errors='ignore') as src:
                        lines = src.readlines()
                    lines_by_file[source_path] = lines

                # Clamp so a start_line below 1 cannot become a negative
                # slice index that wraps around to the end of the file.
                first = max(obj['start_line'] - 1, 0)
                obj['code_content'] = ''.join(lines[first:obj['end_line']])
                contents.append(obj)

        with jsonlines.open(output_path, 'a', flush=True) as writer:
            writer.write_all(contents)
|
| 66 |
+
|
| 67 |
+
# Script entry: append every function whose combined score exceeds the
# threshold to the filtered JSONL file. The threshold appears to be the value
# recorded in the commented-out exploration above as the output of
# `scores[500000]`.
# NOTE(review): this call is unguarded, so it also runs whenever the module is
# imported - consider an `if __name__ == "__main__":` guard.
output_scores('/home/weifengsun/tangou1/step2/step22/dataset', '/home/weifengsun/tangou1/step2/step22/output/function_filtered_scores.jsonl', 0.1282891692796717)
|
| 68 |
+
# with jsonlines.open('/home/weifengsun/tangou1/step2/step22/function_filtered_scores.jsonl', 'r') as reader:
|
| 69 |
+
# print(len(list(reader)))
|
| 70 |
+
# # 500000
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
# path = "/home/weifengsun/tangou1/step2/step22/function_filtered_scores.jsonl"
|
| 74 |
+
# size_bytes = os.path.getsize(path)
|
| 75 |
+
|
| 76 |
+
# size_mb = size_bytes / (1024 * 1024)
|
| 77 |
+
# size_gb = size_bytes / (1024 * 1024 * 1024)
|
| 78 |
+
|
| 79 |
+
# print(f"文件大小: {size_mb:.2f} MB")
|
| 80 |
+
# print(f"文件大小: {size_gb:.2f} GB")
|