# Viewer metadata (file size 1,466 bytes, blob f4e346e) — not part of the script.
import os
import pyarrow.parquet as pq
from glob import glob
from tqdm import tqdm
# Top-level directories scanned (recursively) for .parquet shards.
INPUT_DIRS = [
    "books",
    "fineweb",
    "wikipedia",
]
# Destination directory for the merged corpus; created idempotently.
OUTPUT_DIR = "merged_text"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Single plain-text output file; documents are written separated by blank lines.
OUT_FILE = os.path.join(OUTPUT_DIR, "corpus.txt")
def extract_text_from_parquet(path):
    """Return the text column of a parquet file as a list of strings.

    Preference order: a column named "text", "content", "document",
    "article", or "source"; otherwise the first object-dtype (string-like)
    column found. Returns [] when no candidate column exists or the file
    cannot be read (best-effort: errors are printed, never raised).
    """
    try:
        # Inspect the schema first so we load ONLY the column we need,
        # instead of materializing every column of the table into pandas.
        schema = pq.read_schema(path)
        for col in ("text", "content", "document", "article", "source"):
            if col in schema.names:
                column = pq.read_table(path, columns=[col]).column(col)
                return column.to_pandas().astype(str).tolist()
        # Fallback: no preferred name present — take the first string-like
        # (object-dtype) column of the full table.
        df = pq.read_table(path).to_pandas()
        for col in df.columns:
            if df[col].dtype == object:
                return df[col].astype(str).tolist()
        return []
    except Exception as e:
        # Deliberate best-effort: one corrupt shard must not abort the merge.
        print(f"Error reading {path}: {e}")
        return []
# Collect every parquet shard under each input directory.
all_parquet_files = []
for d in INPUT_DIRS:
    # '**' with recursive=True descends into arbitrarily nested shard dirs.
    all_parquet_files.extend(glob(f"{d}/**/*.parquet", recursive=True))
# glob order is filesystem/OS dependent; sort so the merged corpus is
# byte-for-byte reproducible across runs and machines.
all_parquet_files.sort()
print("Total parquet files found:", len(all_parquet_files))
# Stream every shard's documents into one corpus file, keeping only
# paragraphs that carry real text.
with open(OUT_FILE, "w", encoding="utf-8") as fout:
    for shard_path in tqdm(all_parquet_files, desc="Extracting text"):
        for raw in extract_text_from_parquet(shard_path):
            snippet = raw.strip()
            # Keep only substantial snippets (>= 50 chars) that contain at
            # least one alphabetic character; everything else is noise.
            if len(snippet) >= 50 and any(ch.isalpha() for ch in snippet):
                fout.write(snippet + "\n\n")
print("DONE! Saved merged corpus →", OUT_FILE)
|