Spaces:
Runtime error
Runtime error
minor changes for new wandb project
Browse files
- .gitignore +2 -1
- data/yt_podcast_transcript.csv +0 -0
- src/config.py +5 -3
- src/summarize.py +13 -14
.gitignore
CHANGED
|
@@ -161,4 +161,5 @@ cython_debug/
|
|
| 161 |
notebooks/
|
| 162 |
downloaded_data/
|
| 163 |
wandb/
|
| 164 |
-
.vscode/
|
|
|
|
|
|
| 161 |
notebooks/
|
| 162 |
downloaded_data/
|
| 163 |
wandb/
|
| 164 |
+
.vscode/
|
| 165 |
+
downloaded_artifacts/
|
data/yt_podcast_transcript.csv
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/config.py
CHANGED
|
@@ -8,11 +8,13 @@ class Config:
|
|
| 8 |
|
| 9 |
# paths
|
| 10 |
root_data_dir: Path = Path("data")
|
|
|
|
|
|
|
| 11 |
# wandb
|
| 12 |
project_name: str = "gradient_dissent_qabot"
|
| 13 |
-
yt_podcast_data_artifact: str = "gladiator/
|
| 14 |
-
summarized_data_artifact: str = "gladiator/gradient_dissent_bot/summary_data:latest"
|
| 15 |
-
summarized_que_data_artifact: str = "gladiator/gradient_dissent_bot/summary_que_data:latest"
|
| 16 |
|
| 17 |
|
| 18 |
config = Config()
|
|
|
|
| 8 |
|
| 9 |
# paths
|
| 10 |
root_data_dir: Path = Path("data")
|
| 11 |
+
root_artifact_dir: Path = Path("downloaded_artifacts")
|
| 12 |
+
|
| 13 |
# wandb
|
| 14 |
project_name: str = "gradient_dissent_qabot"
|
| 15 |
+
yt_podcast_data_artifact: str = "gladiator/gradient_dissent_qabot/yt_podcast_transcript:latest"
|
| 16 |
+
# summarized_data_artifact: str = "gladiator/gradient_dissent_bot/summary_data:latest"
|
| 17 |
+
# summarized_que_data_artifact: str = "gladiator/gradient_dissent_bot/summary_que_data:latest"
|
| 18 |
|
| 19 |
|
| 20 |
config = Config()
|
src/summarize.py
CHANGED
|
@@ -2,7 +2,6 @@ import os
|
|
| 2 |
from dataclasses import asdict
|
| 3 |
|
| 4 |
import pandas as pd
|
| 5 |
-
import wandb
|
| 6 |
from langchain.callbacks import get_openai_callback
|
| 7 |
from langchain.chains.summarize import load_summarize_chain
|
| 8 |
from langchain.chat_models import ChatOpenAI
|
|
@@ -12,16 +11,15 @@ from langchain.text_splitter import TokenTextSplitter
|
|
| 12 |
from tqdm import tqdm
|
| 13 |
from wandb.integration.langchain import WandbTracer
|
| 14 |
|
|
|
|
| 15 |
from config import config
|
| 16 |
|
| 17 |
|
| 18 |
-
def get_data(
|
| 19 |
-
artifact_name: str = "gladiator/gradient_dissent_bot/yt_podcast_data:latest",
|
| 20 |
-
total_episodes: int = None,
|
| 21 |
-
):
|
| 22 |
podcast_artifact = wandb.use_artifact(artifact_name, type="dataset")
|
| 23 |
-
podcast_artifact_dir = podcast_artifact.download(config.
|
| 24 |
-
|
|
|
|
| 25 |
if total_episodes is not None:
|
| 26 |
df = df.iloc[:total_episodes]
|
| 27 |
return df
|
|
@@ -77,15 +75,14 @@ if __name__ == "__main__":
|
|
| 77 |
# initialize wandb tracer
|
| 78 |
WandbTracer.init(
|
| 79 |
{
|
| 80 |
-
"project":
|
| 81 |
-
"name": "summarize_3",
|
| 82 |
"job_type": "summarize",
|
| 83 |
"config": asdict(config),
|
| 84 |
}
|
| 85 |
)
|
| 86 |
|
| 87 |
# get scraped data
|
| 88 |
-
df = get_data(artifact_name=config.yt_podcast_data_artifact, total_episodes=
|
| 89 |
|
| 90 |
summaries = []
|
| 91 |
with get_openai_callback() as cb:
|
|
@@ -110,15 +107,17 @@ if __name__ == "__main__":
|
|
| 110 |
|
| 111 |
df["summary"] = summaries
|
| 112 |
|
| 113 |
-
#
|
| 114 |
-
path_to_save = os.path.join(config.root_data_dir, "
|
| 115 |
df.to_csv(path_to_save)
|
| 116 |
-
|
|
|
|
|
|
|
| 117 |
artifact.add_file(path_to_save)
|
| 118 |
wandb.log_artifact(artifact)
|
| 119 |
|
| 120 |
# create wandb table
|
| 121 |
table = wandb.Table(dataframe=df)
|
| 122 |
-
wandb.log({"
|
| 123 |
|
| 124 |
WandbTracer.finish()
|
|
|
|
| 2 |
from dataclasses import asdict
|
| 3 |
|
| 4 |
import pandas as pd
|
|
|
|
| 5 |
from langchain.callbacks import get_openai_callback
|
| 6 |
from langchain.chains.summarize import load_summarize_chain
|
| 7 |
from langchain.chat_models import ChatOpenAI
|
|
|
|
| 11 |
from tqdm import tqdm
|
| 12 |
from wandb.integration.langchain import WandbTracer
|
| 13 |
|
| 14 |
+
import wandb
|
| 15 |
from config import config
|
| 16 |
|
| 17 |
|
| 18 |
+
def get_data(artifact_name: str, total_episodes: int = None):
|
|
|
|
|
|
|
|
|
|
| 19 |
podcast_artifact = wandb.use_artifact(artifact_name, type="dataset")
|
| 20 |
+
podcast_artifact_dir = podcast_artifact.download(config.root_artifact_dir)
|
| 21 |
+
filename = artifact_name.split(":")[0].split("/")[-1]
|
| 22 |
+
df = pd.read_csv(os.path.join(podcast_artifact_dir, f"{filename}.csv"))
|
| 23 |
if total_episodes is not None:
|
| 24 |
df = df.iloc[:total_episodes]
|
| 25 |
return df
|
|
|
|
| 75 |
# initialize wandb tracer
|
| 76 |
WandbTracer.init(
|
| 77 |
{
|
| 78 |
+
"project": config.project_name,
|
|
|
|
| 79 |
"job_type": "summarize",
|
| 80 |
"config": asdict(config),
|
| 81 |
}
|
| 82 |
)
|
| 83 |
|
| 84 |
# get scraped data
|
| 85 |
+
df = get_data(artifact_name=config.yt_podcast_data_artifact, total_episodes=2)
|
| 86 |
|
| 87 |
summaries = []
|
| 88 |
with get_openai_callback() as cb:
|
|
|
|
| 107 |
|
| 108 |
df["summary"] = summaries
|
| 109 |
|
| 110 |
+
# save data
|
| 111 |
+
path_to_save = os.path.join(config.root_data_dir, "summarized_podcasts.csv")
|
| 112 |
df.to_csv(path_to_save)
|
| 113 |
+
|
| 114 |
+
# log to wandb artifact
|
| 115 |
+
artifact = wandb.Artifact("summarized_podcasts", type="dataset")
|
| 116 |
artifact.add_file(path_to_save)
|
| 117 |
wandb.log_artifact(artifact)
|
| 118 |
|
| 119 |
# create wandb table
|
| 120 |
table = wandb.Table(dataframe=df)
|
| 121 |
+
wandb.log({"summarized_podcasts": table})
|
| 122 |
|
| 123 |
WandbTracer.finish()
|