|
|
"""
|
|
|
Script 1: Extract random sentences from the EN-HI and EN-PA parallel files,
with progress bars and optimizations.
|
|
|
"""
|
|
|
|
|
|
import pandas as pd
|
|
|
import random
|
|
|
import ftfy
|
|
|
from langdetect import detect, LangDetectException
|
|
|
import re
|
|
|
import numpy as np
|
|
|
from pathlib import Path
|
|
|
from tqdm import tqdm
|
|
|
import time
|
|
|
|
|
|
def clean_text(text):
    """Normalize one raw cell value into a trimmed, single-line string.

    Returns "" for anything that is not a ``str`` and for the literal
    placeholder ``'nan'`` (callers in this file stringify columns with
    ``astype(str)`` before calling). Any other string is passed through
    ``ftfy.fix_text``, has each whitespace run collapsed to one space, has
    the remaining ASCII control characters removed, and is stripped.
    """
    # All "nothing usable here" cases share one guard.
    if not isinstance(text, str) or text == 'nan' or pd.isna(text):
        return ""

    repaired = ftfy.fix_text(text)
    single_spaced = re.sub(r'\s+', ' ', repaired)
    printable = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', single_spaced)
    return printable.strip()
|
|
|
|
|
|
def is_valid_sentence_fast(text, target_lang):
    """Cheap structural filter for a candidate sentence (no langdetect).

    A sentence passes when it is non-empty and at least 20 characters long,
    has between 5 and 50 whitespace-separated words, uses at least 7
    distinct characters, and - for the target languages 'en', 'hi' and
    'pa' - contains at least one character from the expected script range.
    Any other ``target_lang`` skips the script check.

    Returns:
        bool: True if the sentence passes every check.
    """
    if not text or len(text) < 20:
        return False

    word_count = len(text.split())
    if not 5 <= word_count <= 50:
        return False

    # Very low character variety indicates filler such as "aaaa aaaa ...".
    if len(set(text)) < 7:
        return False

    # One required-script pattern per supported language.
    script_checks = (
        ('en', r'[a-zA-Z]'),
        ('hi', r'[\u0900-\u097F]'),
        ('pa', r'[\u0A00-\u0A7F]'),
    )
    for lang, pattern in script_checks:
        if target_lang == lang:
            return re.search(pattern, text) is not None

    return True
|
|
|
|
|
|
def is_valid_sentence_with_lang(text, target_lang, use_fast=True):
    """Validate a sentence, optionally confirming its language with langdetect.

    The sentence must first pass ``is_valid_sentence_fast``. With
    ``use_fast=True`` (the default) that is the only check. With
    ``use_fast=False`` the text is also run through ``langdetect.detect``
    and rejected only when the detected language conflicts with the target:

    * target 'en' is rejected when 'hi', 'pa', 'mr' or 'gu' is detected;
    * target 'hi' or 'pa' is rejected when 'en' is detected.

    Any other target language, and any text on which detection raises
    ``LangDetectException``, is accepted.

    Returns:
        bool: True if the sentence is accepted.
    """
    if not is_valid_sentence_fast(text, target_lang):
        return False
    if use_fast:
        return True

    try:
        detected = detect(text)
    except LangDetectException:
        # Detection could not decide; the fast filter's verdict stands.
        return True

    if target_lang == 'en':
        conflicting = ('hi', 'pa', 'mr', 'gu')
    elif target_lang in ('hi', 'pa'):
        conflicting = ('en',)
    else:
        return True

    return detected not in conflicting
|
|
|
|
|
|
def _read_parallel_csv(input_csv):
    """Load ``input_csv`` into a DataFrame, or return ``None`` on failure.

    The default read is tried first; if it raises, the error is printed and
    the read is retried with ``encoding='latin-1'``. Malformed rows are
    skipped in both attempts.
    """
    try:
        return pd.read_csv(input_csv, on_bad_lines='skip')
    except Exception as e:
        print(f"Error reading {input_csv}: {e}")

    try:
        return pd.read_csv(input_csv, encoding='latin-1', on_bad_lines='skip')
    except Exception:
        print(f"Failed to read {input_csv}")
        return None


def _pick_column(columns, preferred, fallback_position):
    """Return ``preferred`` if present, else the column at ``fallback_position``, else ``None``."""
    if preferred in columns:
        return preferred
    if len(columns) > fallback_position:
        return columns[fallback_position]
    return None


def _clean_column(series, desc):
    """Clean every cell of ``series`` and keep results longer than 10 characters."""
    kept = []
    for raw in tqdm(series.astype(str), total=len(series), desc=desc):
        cleaned = clean_text(raw)
        if len(cleaned) > 10:
            kept.append(cleaned)
    return kept


def _write_sentences(path, sentences):
    """Write ``sentences`` to ``path`` as UTF-8, one sentence per line."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f"{sentence}\n" for sentence in sentences)


def extract_from_parallel_csv_optimized(input_csv, output_dir, en_samples, other_samples, other_lang_code):
    """Sample cleaned sentences from both sides of a parallel CSV.

    The source and target columns are cleaned and filtered independently,
    then sampled independently, so the two returned lists are NOT aligned
    sentence pairs.

    Args:
        input_csv: Path of the CSV file. The columns named 'src' and 'tgt'
            are used when present; otherwise the second and third columns.
        output_dir: Directory for the output files (created if missing).
        en_samples: Maximum number of source-side (English) sentences.
        other_samples: Maximum number of target-side sentences.
        other_lang_code: Language code of the target side (e.g. 'hi',
            'pa'); used for filtering, progress labels and file names.

    Returns:
        tuple[list[str], list[str]]: ``(sampled_en, sampled_other)``. Both
        lists are empty when the CSV cannot be read or the source/target
        columns cannot be located; nothing is written in that case.
    """
    LANGDETECT_LIMIT = 100000  # cap on sentences sent through langdetect

    print(f"\n{'='*60}")
    print(f"Processing {input_csv}...")
    print(f"Target: {en_samples} EN, {other_samples} {other_lang_code}")
    print('='*60)

    start_time = time.time()

    print("Reading CSV file...")
    df = _read_parallel_csv(input_csv)
    if df is None:
        return [], []

    print(f"Loaded {len(df):,} rows")
    print(f"Columns: {list(df.columns)}")

    # Fall back to positional columns, but do not index past the end of a
    # narrow CSV; report it the same way as an unreadable file.
    src_col = _pick_column(df.columns, 'src', 1)
    tgt_col = _pick_column(df.columns, 'tgt', 2)
    if src_col is None or tgt_col is None:
        print(f"Failed to find source/target columns in {input_csv}")
        return [], []
    print(f"Source: {src_col}, Target: {tgt_col}")

    print("\nCleaning data...")
    print(f"Processing {src_col} column...")
    valid_src = _clean_column(df[src_col], "Cleaning English")
    print(f"\nProcessing {tgt_col} column...")
    valid_tgt = _clean_column(df[tgt_col], f"Cleaning {other_lang_code}")

    print("\nAfter cleaning:")
    print(f"  Valid English sentences: {len(valid_src):,}")
    print(f"  Valid {other_lang_code} sentences: {len(valid_tgt):,}")

    print("\nFast filtering sentences...")
    fast_valid_en = [
        text for text in tqdm(valid_src, desc="Filtering English")
        if is_valid_sentence_fast(text, 'en')
    ]
    fast_valid_other = [
        text for text in tqdm(valid_tgt, desc=f"Filtering {other_lang_code}")
        if is_valid_sentence_fast(text, other_lang_code)
    ]

    print("\nAfter fast filtering:")
    print(f"  English: {len(fast_valid_en):,}")
    print(f"  {other_lang_code}: {len(fast_valid_other):,}")

    if len(fast_valid_en) >= en_samples and len(fast_valid_other) >= other_samples:
        final_en = fast_valid_en
        final_other = fast_valid_other
        print("Using fast-filtered sentences (skipping langdetect)")
    else:
        # NOTE(review): this branch runs only when the fast filter produced
        # too few sentences, and langdetect can only shrink the pool
        # further. Kept as-is; confirm this is the intended policy.
        print("\nApplying language detection on subset...")

        print("Validating English with langdetect...")
        final_en = [
            text
            for text in tqdm(fast_valid_en[:LANGDETECT_LIMIT], desc="English langdetect")
            if is_valid_sentence_with_lang(text, 'en', use_fast=False)
        ]

        print(f"Validating {other_lang_code} with langdetect...")
        final_other = [
            text
            for text in tqdm(fast_valid_other[:LANGDETECT_LIMIT], desc=f"{other_lang_code} langdetect")
            if is_valid_sentence_with_lang(text, other_lang_code, use_fast=False)
        ]

        print("\nAfter langdetect:")
        print(f"  English: {len(final_en):,}")
        print(f"  {other_lang_code}: {len(final_other):,}")

    # Never ask random.sample for more items than are available.
    en_samples = min(en_samples, len(final_en))
    other_samples = min(other_samples, len(final_other))

    print(f"\nSampling {en_samples:,} English and {other_samples:,} {other_lang_code} sentences...")

    sampled_en = random.sample(final_en, en_samples)
    sampled_other = random.sample(final_other, other_samples)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    en_filename = output_dir / f'en_{other_lang_code}_english.txt'
    _write_sentences(en_filename, sampled_en)

    other_filename = output_dir / f'en_{other_lang_code}_{other_lang_code}.txt'
    _write_sentences(other_filename, sampled_other)

    elapsed = time.time() - start_time
    print(f"\n✓ Saved {en_samples:,} English sentences to: {en_filename}")
    print(f"✓ Saved {other_samples:,} {other_lang_code} sentences to: {other_filename}")
    print(f"⏱️ Processing time: {elapsed:.2f} seconds ({elapsed/60:.2f} minutes)")

    return sampled_en, sampled_other
|
|
|
|
|
|
def main():
    """Run the EN-HI and EN-PA extractions and write the combined outputs.

    Steps:
      1. Seed every random source so repeated runs produce the same sample.
      2. Extract sentences from the English-Hindi and English-Punjabi CSVs.
      3. Shuffle all sampled English sentences together and write at most
         ``COMBINED_EN_LIMIT`` of them to ``combined_english.txt``.
      4. Print final statistics and write ``extraction_summary.txt``.
    """
    EN_HI_CSV = "en-hi.csv"
    EN_PA_CSV = "en-pa.csv"
    OUTPUT_DIR = "./extracted_sentences"

    EN_HI_EN_SAMPLES = 150000
    EN_HI_HI_SAMPLES = 300000
    EN_PA_EN_SAMPLES = 150000
    EN_PA_PA_SAMPLES = 300000

    # Upper bound on the number of sentences in the combined English file.
    COMBINED_EN_LIMIT = 100000

    banner = "=" * 70

    print(banner)
    print("MULTILINGUAL DATA EXTRACTION TOOL")
    print(banner)

    random.seed(42)
    np.random.seed(42)
    # langdetect is non-deterministic unless its factory is seeded; without
    # this the langdetect fallback can differ from run to run.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    # The extraction step creates this directory only when it succeeds.
    # Create it up front so the combined and summary files below can always
    # be written, even if both CSVs fail to load.
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("\n" + banner)
    print("EXTRACTING FROM ENGLISH-HINDI DATASET")
    print(banner)

    en_hi_en, en_hi_hi = extract_from_parallel_csv_optimized(
        EN_HI_CSV, OUTPUT_DIR,
        EN_HI_EN_SAMPLES, EN_HI_HI_SAMPLES, 'hi'
    )

    print("\n" + banner)
    print("EXTRACTING FROM ENGLISH-PUNJABI DATASET")
    print(banner)

    en_pa_en, en_pa_pa = extract_from_parallel_csv_optimized(
        EN_PA_CSV, OUTPUT_DIR,
        EN_PA_EN_SAMPLES, EN_PA_PA_SAMPLES, 'pa'
    )

    print("\n" + banner)
    print("CREATING COMBINED ENGLISH FILE")
    print(banner)

    all_english = en_hi_en + en_pa_en
    random.shuffle(all_english)
    combined = all_english[:COMBINED_EN_LIMIT]

    combined_filename = output_dir / "combined_english.txt"
    with open(combined_filename, 'w', encoding='utf-8') as f:
        f.writelines(f"{sentence}\n" for sentence in combined)

    print(f"\n✓ Saved {len(combined):,} combined English sentences")

    print("\n" + banner)
    print("EXTRACTION COMPLETE - FINAL STATISTICS")
    print(banner)
    print(f"Total English sentences: {len(all_english):,}")
    print(f"Total Hindi sentences: {len(en_hi_hi):,}")
    print(f"Total Punjabi sentences: {len(en_pa_pa):,}")

    summary_file = output_dir / "extraction_summary.txt"
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write("DATA EXTRACTION SUMMARY\n")
        f.write("="*50 + "\n\n")
        f.write("English-Hindi Dataset:\n")
        f.write(f"  English sentences: {len(en_hi_en):,}\n")
        f.write(f"  Hindi sentences: {len(en_hi_hi):,}\n\n")
        f.write("English-Punjabi Dataset:\n")
        f.write(f"  English sentences: {len(en_pa_en):,}\n")
        f.write(f"  Punjabi sentences: {len(en_pa_pa):,}\n\n")
        f.write(f"Combined English: {len(combined):,}\n")
        f.write(f"Total corpus size: {len(all_english) + len(en_hi_hi) + len(en_pa_pa):,} sentences\n")

    print(f"\n📊 Summary saved to: {summary_file}")
    print("\n✅ All done! Ready for corpus creation.")
|
|
|
|
|
|
if __name__ == "__main__":
    # tqdm is imported unconditionally at the top of this file, so a missing
    # install already fails there; an ImportError fallback placed here could
    # never run. Install dependencies before launching the script
    # (e.g. `python -m pip install tqdm`).
    main()