Spaces:
Runtime error
Runtime error
| import json | |
| import os | |
| from datetime import datetime, timedelta | |
| import pandas as pd | |
| from datasets import load_dataset | |
| from huggingface_hub import hf_hub_download, list_repo_tree | |
| import config | |
def load_raw_rewriting_as_pandas():
    """Download the raw commit-message rewriting split from the Hub as a DataFrame."""
    dataset = load_dataset(
        config.HF_RAW_DATASET_NAME,
        split=config.HF_RAW_DATASET_SPLIT,
        token=config.HF_TOKEN,
        cache_dir=config.CACHE_DIR,
    )
    return dataset.to_pandas()
def load_full_commit_as_pandas():
    """Load the full-commits dataset and rename its `message` column to `reference`."""
    commits = load_dataset(
        path=config.HF_FULL_COMMITS_DATASET_NAME,
        name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
        split=config.HF_FULL_COMMITS_DATASET_SPLIT,
        cache_dir=config.CACHE_DIR,
    ).to_pandas()
    return commits.rename(columns={"message": "reference"})
def edit_time_from_history(history_str):
    """Return the span of an edit session in whole milliseconds.

    `history_str` is a JSON-encoded list of events, each carrying an ISO-8601
    timestamp under the "ts" key. An empty history yields 0.
    """
    events = json.loads(history_str)
    if not events:
        return 0
    stamps = [datetime.fromisoformat(event["ts"]) for event in events]
    span = max(stamps) - min(stamps)
    return span // timedelta(milliseconds=1)
def edit_time_from_timestamps(row):
    """Milliseconds between `row["loaded_ts"]` and `row["submitted_ts"]`.

    Returns None when the submit timestamp precedes the load timestamp
    (i.e. the computed duration would be negative).
    """
    started = datetime.fromisoformat(row["loaded_ts"])
    finished = datetime.fromisoformat(row["submitted_ts"])
    millis = (finished - started) // timedelta(milliseconds=1)
    if millis < 0:
        return None
    return millis
def load_processed_rewriting_as_pandas():
    """Build the processed rewriting dataset: edit-time columns plus commit mods.

    Loads the raw rewriting split, derives `edit_time_hist` (from the JSON edit
    history) and `edit_time` (from load/submit timestamps), drops the raw
    intermediate columns, and left-joins the `mods` column from the
    full-commits dataset on (hash, repo).
    """
    # .copy() so the column assignments below do not write into a slice of
    # the raw frame (avoids SettingWithCopyWarning / lost writes).
    manual_rewriting = load_raw_rewriting_as_pandas()[
        [
            "hash",
            "repo",
            "commit_msg_start",
            "commit_msg_end",
            "session",
            "commit_msg_history",
            "loaded_ts",
            "submitted_ts",
        ]
    ].copy()
    manual_rewriting["edit_time_hist"] = manual_rewriting["commit_msg_history"].apply(edit_time_from_history)
    manual_rewriting["edit_time"] = manual_rewriting.apply(edit_time_from_timestamps, axis=1)
    # BUG FIX: drop() returns a new frame; the original call discarded the
    # result, so the intermediate columns were never actually removed.
    manual_rewriting = manual_rewriting.drop(columns=["commit_msg_history", "loaded_ts", "submitted_ts"])
    manual_rewriting.set_index(["hash", "repo"], inplace=True)

    mods_dataset = load_full_commit_as_pandas()[["hash", "repo", "mods"]]
    mods_dataset = mods_dataset.set_index(["hash", "repo"])
    return manual_rewriting.join(other=mods_dataset, how="left").reset_index()
def load_synthetic_as_pandas():
    """Load the synthetic dataset (subset `all_pairs_with_metrics`) as a DataFrame."""
    synthetic = load_dataset(
        config.HF_SYNTHETIC_DATASET_NAME,
        "all_pairs_with_metrics",
        split=config.HF_SYNTHETIC_DATASET_SPLIT,
        token=config.HF_TOKEN,
        cache_dir=config.CACHE_DIR,
    )
    return synthetic.to_pandas()
def load_full_commit_with_predictions_as_pandas():
    """Join model predictions onto the full-commits dataset.

    Downloads every JSONL prediction file for the configured model from the
    predictions dataset repo, concatenates them, deduplicates on (hash, repo)
    after a seeded shuffle (so the kept prediction per commit is random but
    reproducible), and left-joins the `prediction` column onto the
    full-commits frame.
    """
    full_dataset = load_full_commit_as_pandas()

    # Download every prediction file under the model's folder in the repo.
    predictions_paths = []
    for prediction_file in list_repo_tree(
        repo_id=config.HF_PREDICTIONS_DATASET_NAME,
        path=os.path.join("commit_message_generation/predictions", config.HF_PREDICTIONS_MODEL),
        repo_type="dataset",
    ):
        predictions_paths.append(
            hf_hub_download(
                repo_id=config.HF_PREDICTIONS_DATASET_NAME,
                # BUG FIX: the file path was passed positionally, landing in the
                # `repo_id` slot and raising "got multiple values for repo_id".
                filename=prediction_file.path,
                repo_type="dataset",
                cache_dir=config.CACHE_DIR,
            )
        )

    dfs = [pd.read_json(path, orient="records", lines=True) for path in predictions_paths]
    predictions_dataset = pd.concat(dfs, axis=0, ignore_index=True)
    # Shuffle with a fixed seed, then keep the first prediction per commit.
    predictions_dataset = predictions_dataset.sample(frac=1, random_state=config.RANDOM_STATE).set_index(
        ["hash", "repo"]
    )[["prediction"]]
    predictions_dataset = predictions_dataset[~predictions_dataset.index.duplicated(keep="first")]

    # Use a list for the join keys (a tuple can be misread as one MultiIndex label).
    dataset = full_dataset.join(other=predictions_dataset, on=["hash", "repo"])
    return dataset.reset_index()