|
|
from datasets import load_dataset |
|
|
import json |
|
|
from tqdm import tqdm |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def download_data(base_url, num_shards):
    """Open a sharded WebDataset for streaming.

    Parameters
    ----------
    base_url : str
        URL template containing an ``{i}`` placeholder for the shard index.
    num_shards : int
        How many shard indices (0..num_shards-1) to expand the template over.

    Returns
    -------
    A streaming ``datasets`` dataset; records are fetched lazily on iteration,
    nothing is materialized up front.
    """
    print("Downloading data...")
    shard_urls = [base_url.format(i=shard) for shard in range(num_shards)]
    return load_dataset(
        "webdataset",
        data_files={"train": shard_urls},
        split="train",
        streaming=True,
    )
|
|
|
|
|
def download_data_from_kaggle(path):
    """Download the ``manann/quotes-500k`` dataset via kagglehub.

    Parameters
    ----------
    path : str
        Ignored — kept only for backward compatibility with existing callers.
        kagglehub chooses its own cache location; the argument is immediately
        overwritten below, exactly as in the original code.

    Returns
    -------
    str
        Local directory kagglehub downloaded the dataset files into.
    """
    # Bug fix: `kagglehub` was referenced but never imported anywhere in the
    # file, so this function raised NameError on every call. Imported locally
    # to keep the dependency scoped to the one function that needs it.
    import kagglehub

    print("Downloading data...")
    path = kagglehub.dataset_download("manann/quotes-500k")
    print("Path to dataset files:", path)
    return path
|
|
|
|
|
|
|
|
|
|
|
def extract_prompts(dataset, jsonl_file_path):
    """Write the ``prompt`` field of every dataset record to a JSONL file.

    Each record is expected to be a mapping with ``row['json']['prompt']``
    (the shape produced by the WebDataset loader above — confirm for other
    dataset sources). One JSON-encoded prompt is written per line.

    Parameters
    ----------
    dataset : iterable
        Iterable of records (e.g. a streaming dataset).
    jsonl_file_path : str
        Destination path; the file is truncated and rewritten.
    """
    print('Extracting data to:', jsonl_file_path)
    # Fixes vs. original:
    #  - dropped the `prompts` dict that accumulated every prompt in memory
    #    but was never read or returned (unbounded growth on a streamed set);
    #  - explicit utf-8 encoding so output does not depend on the platform's
    #    default locale encoding.
    with open(jsonl_file_path, 'w', encoding='utf-8') as f:
        with tqdm(desc="Processing prompts", unit=" prompt") as pbar:
            for row in dataset:
                f.write(json.dumps(row['json']['prompt']) + '\n')
                pbar.update(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_data_from_csv(csv_path):
    """Read the ``quote`` column of a CSV file into a plain list.

    Parameters
    ----------
    csv_path : str
        Path to a CSV file that contains a ``quote`` column.

    Returns
    -------
    list
        The ``quote`` column values in file order.

    Raises
    ------
    KeyError
        If the file has no ``quote`` column.
    """
    df = pd.read_csv(csv_path)
    # (removed the original's stray no-op expression statement `df['quote']`)
    return df['quote'].tolist()
|
|
|
|
|
|
|
|
def load_quotes_from_csv(file_path):
    """Load quotes from a CSV, appending ``" - <author>"`` when an author exists.

    Expects ``quote`` and ``author`` columns. Rows with a missing (NaN) author
    keep the bare quote text.

    Parameters
    ----------
    file_path : str
        Path to the quotes CSV.

    Returns
    -------
    list[str]
        Quote strings, author-suffixed where available.
    """
    print('Loading quotes from:', file_path)
    # (removed the original's unused local `prompts = []`)
    quotes_df = pd.read_csv(file_path)
    quotes_df['quote'] = quotes_df['quote'] + quotes_df['author'].apply(
        lambda x: f" - {x}" if pd.notna(x) else ""
    )
    quotes = quotes_df["quote"].astype(str).tolist()
    print("Quotes loaded:", len(quotes))
    # Guard added: the original unconditionally printed quotes[0] and raised
    # IndexError on an empty CSV.
    if quotes:
        print("First quote:", quotes[0][:100])
    print("Data loaded successfully.")
    return quotes
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
csv_file_path = r"C:\Users\jov2bg\Desktop\PromptSearch\search_engine\data\quotes_new.csv" |
|
|
download_data_from_kaggle(csv_file_path) |
|
|
read_data_from_csv(csv_file_path) |