Spaces:
Runtime error
Runtime error
| from pathlib import Path | |
| DOCS_FOLDER = Path("documents") | |
| def download_lectures(docs_folder=DOCS_FOLDER): | |
| import os | |
| import subprocess | |
| if not os.path.exists(docs_folder): | |
| os.makedirs(docs_folder, exist_ok=True) | |
| lecture_titles = get_lecture_titles() | |
| lecture_md_urls = list_lecture_md_urls(lecture_titles) | |
| for idx, url in lecture_md_urls.items(): | |
| filename = "documents/lecture-{}.md".format(str(idx).zfill(2)) | |
| if not os.path.exists(filename): | |
| subprocess.run(["wget","-O", filename, url], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) | |
| def list_lecture_md_urls(lecture_titles): | |
| lecture_md_url_base = "https://raw.githubusercontent.com/full-stack-deep-learning/website/main/docs/course/2022/" | |
| lecture_md_urls = {idx: lecture_md_url_base + title + "/index.md" for idx, title in lecture_titles.items()} | |
| return lecture_md_urls | |
| def get_lecture_titles(): | |
| lecture_titles = { | |
| 1: "lecture-1-course-vision-and-when-to-use-ml", | |
| 2: "lecture-2-development-infrastructure-and-tooling", | |
| 3: "lecture-3-troubleshooting-and-testing", | |
| 4: "lecture-4-data-management", | |
| 5: "lecture-5-deployment", | |
| 6: "lecture-6-continual-learning", | |
| 7: "lecture-7-foundation-models", | |
| 8: "lecture-8-teams-and-pm", | |
| 9: "lecture-9-ethics" | |
| } | |
| return lecture_titles | |
| def produce_documents(docs_folder=DOCS_FOLDER): | |
| """Assumes the documents are on disk already.""" | |
| import os | |
| from pathlib import Path | |
| import shutil | |
| import string | |
| import srt | |
| if not os.path.exists(docs_folder): | |
| os.makedirs(docs_folder, exist_ok=True) | |
| lecture_md_filenames = [elem for elem in os.listdir(docs_folder) if "lecture" in elem] | |
| lecture_titles = get_lecture_titles() | |
| lecture_texts = {} | |
| for fn in lecture_md_filenames: | |
| idx = int("".join(elem for elem in fn if elem in string.digits)) | |
| lecture_md_path = docs_folder / fn | |
| with open(lecture_md_path) as f: | |
| lecture = f.read() | |
| lecture_texts[idx] = lecture | |
| from langchain.text_splitter import CharacterTextSplitter | |
| text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) | |
| lecture_texts_split = {idx: text_splitter.split_text(lecture_text) for idx, lecture_text in lecture_texts.items()} | |
| website_url_base = "https://fullstackdeeplearning.com/course/2022/" | |
| source_urls = {idx: website_url_base + title for idx, title in lecture_titles.items()} | |
| source_urls_split = {"source": [source_urls[idx]] * len(splits) for idx, splits in lecture_texts_split.items()} | |
| lecture_texts_flat = [split for lecture_text in lecture_texts_split.values() for split in lecture_text] | |
| source_urls_flat = [{"source": source_urls[idx]} for idx, lecture_text in lecture_texts_split.items() for split in lecture_text] | |
| srt_filenames = list(sorted([elem for elem in os.listdir(docs_folder) if elem.endswith(".srt")])) | |
| srt_urls = get_srt_urls() | |
| srt_texts_flat, srt_metadatas_flat = [], [] | |
| for fn in srt_filenames: | |
| idx = int("".join(elem for elem in fn if elem in string.digits)) | |
| srt_url = srt_urls[idx] | |
| srt_text_path = docs_folder / fn | |
| with open(srt_text_path) as f: | |
| srt_text = "\n".join(f.readlines()) | |
| subtitles = list(srt.parse(srt_text)) | |
| texts, metadatas = create_srt_texts_and_metadatas(subtitles, srt_url) | |
| srt_texts_flat += texts | |
| srt_metadatas_flat += metadatas | |
| texts_flat = lecture_texts_flat + srt_texts_flat | |
| metadatas_flat = source_urls_flat + srt_metadatas_flat | |
| return texts_flat, metadatas_flat | |
| def create_srt_texts_and_metadatas(subtitles, base_url): | |
| query_params_format = "&t={start}s" | |
| texts, metadatas = [], [] | |
| for subtitle in subtitles: | |
| raw_text = subtitle.content | |
| text = subtitle.content.strip() | |
| start = timestamp_from_timedelta(subtitle.start) | |
| url = base_url + query_params_format.format(start=start) | |
| texts.append(text) | |
| metadatas.append({"source": url}) | |
| return texts, metadatas | |
| def timestamp_from_timedelta(timedelta): | |
| return int(timedelta.total_seconds()) | |
| def get_srt_urls(): | |
| return { | |
| 1: "https://www.youtube.com/watch?v=-Iob-FW5jVM", | |
| 2: "https://www.youtube.com/watch?v=BPYOsDCZbno", | |
| 3: "https://www.youtube.com/watch?v=RLemHNAO5Lw", | |
| 4: "https://www.youtube.com/watch?v=Jlm4oqW41vY", | |
| 5: "https://www.youtube.com/watch?v=W3hKjXg7fXM", | |
| 6: "https://www.youtube.com/watch?v=nra0Tt3a-Oc", | |
| 7: "https://www.youtube.com/watch?v=Rm11UeGwGgk", | |
| 8: "https://www.youtube.com/watch?v=a54xH6nT4Sw", | |
| 9: "https://www.youtube.com/watch?v=7FQpbYTqjAA" | |
| } | |
| if __name__ == "__main__": | |
| download_lectures() | |
| texts, metadatas = produce_documents() | |
| print(texts[-1]) | |
| print(metadatas[-1]) | |