# (Hugging Face Spaces page header residue removed: "Spaces: Sleeping Sleeping")
# This file is a WIP attempt to locally recreate
# https://colab.research.google.com/drive/1-h3rPUzV-j9VzD9Rg7ZLGKEp-jMNFaje?usp=sharing
# NOTE: this script is not working as expected; it is not able to load the
# training data from the file.
| import uuid | |
| import tqdm | |
| import json | |
| import asyncio | |
| from pathlib import Path | |
| from dotenv import load_dotenv | |
| from langchain_openai import ChatOpenAI | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain_community.document_loaders import DirectoryLoader | |
| from langchain_community.document_loaders import UnstructuredMarkdownLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| CHUNK_SIZE = 1000 | |
| CHUNK_OVERLAP = CHUNK_SIZE // 2 | |
| QA_PROMPT = """\ | |
| Given the following context, you must generate questions based on only the provided context. | |
| You are to generate {n_questions} questions which should be provided in the following format: | |
| 1. QUESTION #1 | |
| 2. QUESTION #2 | |
| ... | |
| Context: | |
| {context} | |
| """ | |
| fine_tuning_data_filepath = Path("data/finetuning") | |
| fine_tuning_data_filepath.mkdir(parents=True, exist_ok=True) | |
| async def create_questions(documents, n_questions, question_generation_chain): | |
| questions = {} | |
| relevant_docs = {} | |
| for document in tqdm.tqdm(documents): | |
| context = document.page_content | |
| # get questions by invoking the question generation chain | |
| response = await question_generation_chain.ainvoke( | |
| {"context": context, "n_questions": n_questions} | |
| ) | |
| # split the response into two questions | |
| [question1, question2] = response.content.split("\n") | |
| # generate a unique id for the first question | |
| id1 = str(uuid.uuid4()) | |
| while id1 in questions: | |
| id1 = str(uuid.uuid4()) | |
| # store the first question | |
| questions[id1] = question1[2:].strip() | |
| # generate a unique id for the second question | |
| id2 = str(uuid.uuid4()) | |
| while id2 in questions: | |
| id2 = str(uuid.uuid4()) | |
| # store the second question | |
| questions[id2] = question2[2:].strip() | |
| # Store the relevant doc for each questions | |
| relevant_docs[id1] = [document.metadata["id"]] | |
| relevant_docs[id2] = [document.metadata["id"]] | |
| return questions, relevant_docs | |
| async def main(): | |
| path = "data/scraped/clean" | |
| text_loader = DirectoryLoader(path, glob="*.txt", loader_cls=UnstructuredMarkdownLoader) | |
| text_splitter = RecursiveCharacterTextSplitter( | |
| chunk_size = CHUNK_SIZE, | |
| chunk_overlap = CHUNK_OVERLAP, | |
| length_function = len | |
| ) | |
| training_documents = text_splitter.split_documents(text_loader.load()) | |
| # Add unique id to each document | |
| id_set = set() | |
| for document in training_documents: | |
| id = str(uuid.uuid4()) | |
| while id in id_set: | |
| id = uuid.uuid4() | |
| id_set.add(id) | |
| document.metadata["id"] = id | |
| TRAINING_DOC_LENGTH = len(training_documents) | |
| BREAK1 = TRAINING_DOC_LENGTH - 24 | |
| BREAK2 = TRAINING_DOC_LENGTH - 12 | |
| training_split_documents = training_documents[:TRAINING_DOC_LENGTH - 24] | |
| eval_split_documents = training_documents[BREAK1:BREAK2] | |
| test_split_documents = training_documents[BREAK2:] | |
| qa_chat_model = ChatOpenAI( | |
| model="gpt-4o-mini", | |
| temperature=0 | |
| ) | |
| qa_prompt_template = ChatPromptTemplate.from_template(QA_PROMPT) | |
| question_generation_chain = qa_prompt_template | qa_chat_model | |
| # try to load training data from file otherwise generate new data | |
| try: | |
| training_dataset = json.load(open(fine_tuning_data_filepath / "training_dataset.jsonl")) | |
| training_questions = training_dataset["questions"] | |
| training_relevant_contexts = training_dataset["relevant_contexts"] | |
| training_corpus = training_dataset["corpus"] | |
| except: | |
| training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2, question_generation_chain) | |
| training_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents} | |
| training_dataset = { | |
| "questions" : training_questions, | |
| "relevant_contexts" : training_relevant_contexts, | |
| "corpus" : training_corpus | |
| } | |
| with open(fine_tuning_data_filepath /"training_dataset.jsonl", "w") as f: | |
| json.dump(training_dataset, f) | |
| # try to load eval data from file otherwise generate new data | |
| try: | |
| eval_dataset = json.load(open(fine_tuning_data_filepath / "eval_dataset.jsonl")) | |
| eval_questions = eval_dataset["questions"] | |
| eval_relevant_contexts = eval_dataset["relevant_contexts"] | |
| eval_corpus = eval_dataset["corpus"] | |
| except: | |
| eval_questions, eval_relevant_contexts = await create_questions(eval_split_documents, 2, question_generation_chain) | |
| eval_corpus = {eval_item.metadata["id"] : eval_item.page_content for eval_item in eval_split_documents} | |
| eval_dataset = { | |
| "questions" : eval_questions, | |
| "relevant_contexts" : eval_relevant_contexts, | |
| "corpus" : eval_corpus, | |
| } | |
| with open(fine_tuning_data_filepath /"eval_dataset.jsonl", "w") as f: | |
| json.dump(eval_dataset, f) | |
| # try to load test data from file otherwise generate new data | |
| try: | |
| test_dataset = json.load(open(fine_tuning_data_filepath / "test_dataset.jsonl")) | |
| test_questions = test_dataset["questions"] | |
| test_relevant_contexts = test_dataset["relevant_contexts"] | |
| test_corpus = test_dataset["corpus"] | |
| except: | |
| test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2, question_generation_chain) | |
| test_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents} | |
| test_dataset = { | |
| "questions" : test_questions, | |
| "relevant_contexts" : test_relevant_contexts, | |
| "corpus" : test_corpus, | |
| } | |
| with open(fine_tuning_data_filepath /"test_dataset.jsonl", "w") as f: | |
| json.dump(test_dataset, f) | |
| import wandb | |
| from torch.utils.data import DataLoader | |
| from sentence_transformers import InputExample, SentenceTransformer | |
| from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss | |
| from sentence_transformers.evaluation import InformationRetrievalEvaluator | |
| from huggingface_hub import notebook_login | |
| BATCH_SIZE = 10 | |
| MODEL_ID = "Snowflake/snowflake-arctic-embed-l" | |
| model = SentenceTransformer(MODEL_ID) | |
| wandb.init(mode="disabled") | |
| corpus = training_dataset['corpus'] | |
| queries = training_dataset['questions'] | |
| relevant_docs = training_dataset['relevant_contexts'] | |
| examples = [] | |
| for query_id, query in queries.items(): | |
| doc_id = relevant_docs[query_id][0] | |
| text = corpus[doc_id] | |
| example = InputExample(texts=[query, text]) | |
| examples.append(example) | |
| loader = DataLoader( | |
| examples, batch_size=BATCH_SIZE | |
| ) | |
| matryoshka_dimensions = [768, 512, 256, 128, 64] | |
| inner_train_loss = MultipleNegativesRankingLoss(model) | |
| train_loss = MatryoshkaLoss( | |
| model, inner_train_loss, matryoshka_dims=matryoshka_dimensions | |
| ) | |
| evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs) | |
| EPOCHS = 10 | |
| warmup_steps = int(len(loader) * EPOCHS * 0.1) | |
| model.fit( | |
| train_objectives=[(loader, train_loss)], | |
| epochs=EPOCHS, | |
| warmup_steps=warmup_steps, | |
| output_path='AIE5-MidTerm-finetuned-embeddings', | |
| show_progress_bar=True, | |
| evaluator=evaluator, | |
| evaluation_steps=50 | |
| ) | |
| notebook_login() | |
| hf_username = "thomfoolery" | |
| model.push_to_hub(f"{hf_username}/AIE5-MidTerm-finetuned-embeddings") | |
| if __name__ == "__main__": | |
| load_dotenv() | |
| asyncio.run(main()) |