"""Helpers for downloading prompt/quote datasets and loading them into memory."""
import json

import pandas as pd

# `datasets` and `tqdm` are only needed by the download/extract helpers.
# Guard the imports so the pandas-only CSV loaders stay usable without them.
try:
    from datasets import load_dataset
except ImportError:
    load_dataset = None

try:
    from tqdm import tqdm
except ImportError:
    tqdm = None


def download_data(base_url, num_shards):
    """Open a sharded WebDataset as a streaming dataset.

    Args:
        base_url: URL template containing an ``{i}`` placeholder that is
            filled in with each shard index.
        num_shards: Number of shards; indices ``0 .. num_shards - 1`` are used.

    Returns:
        The streaming ``train`` split returned by ``datasets.load_dataset``.

    Raises:
        ValueError: If ``num_shards`` is less than 1.
        ImportError: If the ``datasets`` package is not installed.
    """
    if num_shards < 1:
        raise ValueError(f"num_shards must be at least 1, got {num_shards}")
    if load_dataset is None:
        raise ImportError("download_data requires the 'datasets' package")

    print("Downloading data...")
    urls = [base_url.format(i=i) for i in range(num_shards)]
    return load_dataset(
        "webdataset",
        data_files={"train": urls},
        split="train",
        streaming=True,
    )


def download_data_from_kaggle(path=None, dataset_handle="manann/quotes-500k"):
    """Download the latest version of a Kaggle dataset via ``kagglehub``.

    Args:
        path: Ignored. The original implementation overwrote this argument
            before using it; it is kept only so existing callers keep working.
        dataset_handle: Kaggle dataset identifier to download.

    Returns:
        The value returned by ``kagglehub.dataset_download`` (printed as the
        path to the dataset files).

    Raises:
        ImportError: If ``kagglehub`` is not installed.
    """
    # Imported lazily: the module-level import was disabled in this file, so
    # only this function depends on kagglehub being available.
    import kagglehub

    print("Downloading data...")
    download_dir = kagglehub.dataset_download(dataset_handle)
    print("Path to dataset files:", download_dir)
    return download_dir


def extract_prompts(dataset, jsonl_file_path):
    """Write the prompt of every dataset row to a JSON Lines file.

    Each row is read as ``row['json']['prompt']`` and written as one
    JSON-encoded value per line.

    Args:
        dataset: Iterable of rows supporting ``row['json']['prompt']``.
        jsonl_file_path: Destination file; overwritten if it exists.

    Returns:
        The number of prompts written.
    """
    print('Extracting data to:', jsonl_file_path)

    # The progress bar is cosmetic, so fall back to plain iteration when tqdm
    # is unavailable.
    rows = dataset
    if tqdm is not None:
        rows = tqdm(dataset, desc="Processing prompts", unit=" prompt")

    written = 0
    with open(jsonl_file_path, 'w', encoding="utf-8") as f:
        for row in rows:
            # Write each prompt straight to disk; keeping them all in memory
            # would grow without bound on a streaming dataset.
            f.write(json.dumps(row['json']['prompt']) + '\n')
            written += 1
    return written


def read_data_from_csv(csv_path):
    """Return the ``quote`` column of a CSV file as a list.

    Args:
        csv_path: Path to a CSV file that has a ``quote`` column.

    Returns:
        The values of the ``quote`` column, in file order.

    Raises:
        KeyError: If the file has no ``quote`` column.
    """
    df = pd.read_csv(csv_path)
    return df['quote'].tolist()


def load_quotes_from_csv(file_path):
    """Load quotes from a CSV file, appending the author where one is given.

    Rows with an author become ``"<quote> - <author>"``; rows whose author is
    missing keep the quote text alone.

    Args:
        file_path: Path to a CSV file with ``quote`` and ``author`` columns.

    Returns:
        A list of strings, one per CSV row.

    Raises:
        KeyError: If the ``quote`` or ``author`` column is missing.
    """
    print('Loading quotes from:', file_path)
    quotes_df = pd.read_csv(file_path)

    author_suffix = quotes_df['author'].apply(
        lambda x: f" - {x}" if pd.notna(x) else ""
    )
    quotes = (quotes_df['quote'] + author_suffix).astype(str).tolist()

    print("Quotes loaded:", len(quotes))  # should be 499709
    if quotes:
        # Guarded so a CSV without data rows does not raise IndexError.
        print("First quote:", quotes[0][:100])
    print("Data loaded successfully.")
    return quotes


if __name__ == "__main__":
    csv_file_path = r"C:\Users\jov2bg\Desktop\PromptSearch\search_engine\data\quotes_new.csv"
    download_data_from_kaggle(csv_file_path)
    read_data_from_csv(csv_file_path)