Spaces:
Runtime error
Runtime error
Fix CSV data saving + minor name changes
Browse files
- src/config.py +4 -2
- src/extract_questions.py +7 -5
- src/summarize.py +1 -1
src/config.py
CHANGED
|
@@ -13,8 +13,10 @@ class Config:
|
|
| 13 |
# wandb
|
| 14 |
project_name: str = "gradient_dissent_qabot"
|
| 15 |
yt_podcast_data_artifact: str = "gladiator/gradient_dissent_qabot/yt_podcast_transcript:latest"
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
config = Config()
|
|
|
|
| 13 |
# wandb
|
| 14 |
project_name: str = "gradient_dissent_qabot"
|
| 15 |
yt_podcast_data_artifact: str = "gladiator/gradient_dissent_qabot/yt_podcast_transcript:latest"
|
| 16 |
+
summarized_data_artifact: str = "gladiator/gradient_dissent_bot/summarized_podcasts:latest"
|
| 17 |
+
summarized_que_data_artifact: str = (
|
| 18 |
+
"gladiator/gradient_dissent_bot/summarized_que_podcasts:latest"
|
| 19 |
+
)
|
| 20 |
|
| 21 |
|
| 22 |
config = Config()
|
src/extract_questions.py
CHANGED
|
@@ -16,11 +16,13 @@ import wandb
|
|
| 16 |
from config import config
|
| 17 |
|
| 18 |
|
| 19 |
-
def get_data(artifact_name: str =
|
| 20 |
podcast_artifact = wandb.use_artifact(artifact_name, type="dataset")
|
| 21 |
podcast_artifact_dir = podcast_artifact.download(config.root_data_dir)
|
| 22 |
filename = artifact_name.split(":")[0].split("/")[-1]
|
| 23 |
df = pd.read_csv(os.path.join(podcast_artifact_dir, f"{filename}.csv"))
|
|
|
|
|
|
|
| 24 |
return df
|
| 25 |
|
| 26 |
|
|
@@ -66,7 +68,6 @@ if __name__ == "__main__":
|
|
| 66 |
WandbTracer.init(
|
| 67 |
{
|
| 68 |
"project": "gradient_dissent_bot",
|
| 69 |
-
"name": "extract_questions",
|
| 70 |
"job_type": "extract_questions",
|
| 71 |
"config": asdict(config),
|
| 72 |
}
|
|
@@ -101,14 +102,15 @@ if __name__ == "__main__":
|
|
| 101 |
df["questions"] = questions
|
| 102 |
|
| 103 |
# log to wandb artifact
|
| 104 |
-
path_to_save = os.path.join(config.root_data_dir, "
|
| 105 |
df.to_csv(path_to_save, index=False)
|
| 106 |
-
artifact = wandb.Artifact("
|
| 107 |
artifact.add_file(path_to_save)
|
| 108 |
wandb.log_artifact(artifact)
|
| 109 |
|
| 110 |
# create wandb table
|
|
|
|
| 111 |
table = wandb.Table(dataframe=df)
|
| 112 |
-
wandb.log({"
|
| 113 |
|
| 114 |
WandbTracer.finish()
|
|
|
|
| 16 |
from config import config
|
| 17 |
|
| 18 |
|
| 19 |
+
def get_data(artifact_name: str, total_episodes: int = None):
|
| 20 |
podcast_artifact = wandb.use_artifact(artifact_name, type="dataset")
|
| 21 |
podcast_artifact_dir = podcast_artifact.download(config.root_data_dir)
|
| 22 |
filename = artifact_name.split(":")[0].split("/")[-1]
|
| 23 |
df = pd.read_csv(os.path.join(podcast_artifact_dir, f"{filename}.csv"))
|
| 24 |
+
if total_episodes is not None:
|
| 25 |
+
df = df.iloc[:total_episodes]
|
| 26 |
return df
|
| 27 |
|
| 28 |
|
|
|
|
| 68 |
WandbTracer.init(
|
| 69 |
{
|
| 70 |
"project": "gradient_dissent_bot",
|
|
|
|
| 71 |
"job_type": "extract_questions",
|
| 72 |
"config": asdict(config),
|
| 73 |
}
|
|
|
|
| 102 |
df["questions"] = questions
|
| 103 |
|
| 104 |
# log to wandb artifact
|
| 105 |
+
path_to_save = os.path.join(config.root_data_dir, "summarized_que_podcasts.csv")
|
| 106 |
df.to_csv(path_to_save, index=False)
|
| 107 |
+
artifact = wandb.Artifact("summarized_que_podcasts", type="dataset")
|
| 108 |
artifact.add_file(path_to_save)
|
| 109 |
wandb.log_artifact(artifact)
|
| 110 |
|
| 111 |
# create wandb table
|
| 112 |
+
df["questions"] = df["questions"].apply(lambda x: "\n".join(x))
|
| 113 |
table = wandb.Table(dataframe=df)
|
| 114 |
+
wandb.log({"summarized_que_podcasts": table})
|
| 115 |
|
| 116 |
WandbTracer.finish()
|
src/summarize.py
CHANGED
|
@@ -109,7 +109,7 @@ if __name__ == "__main__":
|
|
| 109 |
|
| 110 |
# save data
|
| 111 |
path_to_save = os.path.join(config.root_data_dir, "summarized_podcasts.csv")
|
| 112 |
-
df.to_csv(path_to_save)
|
| 113 |
|
| 114 |
# log to wandb artifact
|
| 115 |
artifact = wandb.Artifact("summarized_podcasts", type="dataset")
|
|
|
|
| 109 |
|
| 110 |
# save data
|
| 111 |
path_to_save = os.path.join(config.root_data_dir, "summarized_podcasts.csv")
|
| 112 |
+
df.to_csv(path_to_save, index=False)
|
| 113 |
|
| 114 |
# log to wandb artifact
|
| 115 |
artifact = wandb.Artifact("summarized_podcasts", type="dataset")
|