|
|
""" |
|
|
DOMAIN-SPECIFIC DATASETS - İLERİ SEVİYE HUGGING FACE |
|
|
==================================================== |
|
|
|
|
|
Bu modülde öğrenecekleriniz: |
|
|
1. Bilimsel Makaleler (arXiv, PubMed) - Academic datasets |
|
|
2. Kod Datasets (The Stack, CodeParrot) - Programming datasets |
|
|
3. Finansal Analiz Datasets - Finance & Business |
|
|
4. Tıbbi/Sağlık Datasets - Medical & Healthcare |
|
|
5. Domain-specific preprocessing |
|
|
6. Custom tokenization |
|
|
7. Domain adaptation techniques |
|
|
""" |
|
|
|
|
|
from datasets import Dataset, load_dataset, DatasetDict |
|
|
import numpy as np |
|
|
import json |
|
|
from typing import Dict, List |
|
|
import time |
|
|
from collections import Counter |
|
|
import re |
|
|
|
|
|
print("="*70) |
|
|
print("🔬 DOMAIN-SPECIFIC DATASETS - İLERİ SEVİYE") |
|
|
print("="*70) |
|
|
|
|
|
print("\n" + "="*70) |
|
|
print("1. BİLİMSEL MAKALELER - ACADEMIC DATASETS") |
|
|
print("="*70) |
|
|
|
|
|
|
|
|
def generate_scientific_papers(num_samples=1000):
    """Build a synthetic arXiv-style dataset of scientific papers.

    Each record carries id/title/abstract/authors/domain/year/citations/
    keywords/full_text fields populated with random values. Returns a
    `datasets.Dataset` built lazily from a generator.
    """
    # Candidate research fields; one is sampled per paper.
    domains = ['Physics', 'Computer Science', 'Biology', 'Mathematics', 'Chemistry']

    def gen():
        # Generator consumed lazily by Dataset.from_generator below.
        for i in range(num_samples):
            domain = np.random.choice(domains)

            # Fixed four-sentence abstract template; only the domain varies.
            abstract = f"This paper presents a novel approach to {domain.lower()} research. " \
                       f"We propose a methodology that addresses key challenges in the field. " \
                       f"Our experimental results show significant improvements over baseline methods. " \
                       f"The proposed framework demonstrates applicability across multiple scenarios."

            yield {
                'id': f'arxiv.{i:06d}',
                'title': f'Advanced Methods in {domain} Research: A Comprehensive Study {i}',
                'abstract': abstract,
                'authors': [f'Author {j}' for j in range(np.random.randint(2, 6))],
                'domain': domain,
                'year': np.random.randint(2015, 2025),
                'citations': np.random.randint(0, 500),
                'keywords': [f'keyword{j}' for j in range(np.random.randint(3, 8))],
                # full_text simulates a longer paper body by repeating the abstract.
                'full_text': abstract + " " + abstract * np.random.randint(5, 15)
            }

    return Dataset.from_generator(gen)
|
|
|
|
|
print("\n📚 Bilimsel Makale Dataset'i Oluşturuluyor...") |
|
|
scientific_dataset = generate_scientific_papers(2000) |
|
|
|
|
|
print(f"✅ {len(scientific_dataset)} bilimsel makale yüklendi") |
|
|
print(f"\nÖrnek makale:") |
|
|
sample = scientific_dataset[0] |
|
|
print(f" ID: {sample['id']}") |
|
|
print(f" Başlık: {sample['title']}") |
|
|
print(f" Domain: {sample['domain']}") |
|
|
print(f" Yazar sayısı: {len(sample['authors'])}") |
|
|
print(f" Yıl: {sample['year']}") |
|
|
print(f" Atıf sayısı: {sample['citations']}") |
|
|
print(f" Abstract: {sample['abstract'][:150]}...") |
|
|
|
|
|
|
|
|
print("\n📊 Domain Dağılımı:") |
|
|
domains = [ex['domain'] for ex in scientific_dataset] |
|
|
domain_counts = Counter(domains) |
|
|
for domain, count in domain_counts.most_common(): |
|
|
pct = (count / len(scientific_dataset)) * 100 |
|
|
print(f" {domain}: {count} ({pct:.1f}%)") |
|
|
|
|
|
|
|
|
print("\n📅 Yıllara Göre Yayın Sayısı:") |
|
|
years = [ex['year'] for ex in scientific_dataset] |
|
|
year_counts = Counter(years) |
|
|
for year in sorted(year_counts.keys())[-5:]: |
|
|
print(f" {year}: {year_counts[year]} makale") |
|
|
|
|
|
|
|
|
citations = [ex['citations'] for ex in scientific_dataset] |
|
|
print(f"\n📈 Atıf İstatistikleri:") |
|
|
print(f" Ortalama: {np.mean(citations):.1f}") |
|
|
print(f" Median: {np.median(citations):.1f}") |
|
|
print(f" En çok atıf: {np.max(citations)}") |
|
|
|
|
|
|
|
|
print("\n🔧 Bilimsel Text Preprocessing:") |
|
|
|
|
|
def preprocess_scientific_text(examples):
    """Clean a batch of abstracts for downstream NLP use.

    For each abstract: lowercase, drop every character that is not a word
    character, whitespace or a period, and collapse whitespace runs.

    Returns new columns: 'abstract_clean', 'abstract_length', 'word_count'.
    """
    cleaned = [
        ' '.join(re.sub(r'[^\w\s\.]', '', raw.lower()).split())
        for raw in examples['abstract']
    ]

    return {
        'abstract_clean': cleaned,
        'abstract_length': [text_len for text_len in map(len, cleaned)],
        'word_count': [len(text.split()) for text in cleaned],
    }
|
|
|
|
|
# Apply the cleaning function over the whole dataset in batches of 500.
scientific_processed = scientific_dataset.map(
    preprocess_scientific_text,
    batched=True,
    batch_size=500,
    desc="Preprocessing scientific texts"
)

print(f"✅ {len(scientific_processed)} makale işlendi")
print(f"\nÖrnek işlenmiş abstract:")
print(f"   Original: {scientific_processed[0]['abstract'][:100]}...")
print(f"   Cleaned: {scientific_processed[0]['abstract_clean'][:100]}...")
print(f"   Word count: {scientific_processed[0]['word_count']}")

# Section 2: programming/code datasets.
print("\n" + "="*70)
print("2. KOD DATASETS - PROGRAMMING & SOFTWARE")
print("="*70)
|
|
def generate_code_dataset(num_samples=1000):
    """Build a synthetic dataset of code snippets in several languages.

    Languages with an entry in ``code_templates`` get a templated function
    (including a docstring/comment line); the others get a one-line
    fallback stub. Returns a `datasets.Dataset` built from a generator.
    """
    languages = ['Python', 'JavaScript', 'Java', 'C++', 'Go', 'Rust']

    # Per-language snippet templates; placeholders are filled per sample.
    code_templates = {
        'Python': '''def {func_name}({params}):
    """
    {docstring}
    """
    result = {body}
    return result''',

        'JavaScript': '''function {func_name}({params}) {{
    // {docstring}
    const result = {body};
    return result;
}}''',

        'Java': '''public {return_type} {func_name}({params}) {{
    // {docstring}
    {return_type} result = {body};
    return result;
}}''',
    }

    def gen():
        for i in range(num_samples):
            lang = np.random.choice(languages)

            func_name = f"process_data_{i}"
            params = "data, config"
            docstring = f"Process data using method {i}"
            body = "data * 2 + config"

            if lang in code_templates:
                code = code_templates[lang].format(
                    func_name=func_name,
                    params=params,
                    docstring=docstring,
                    body=body,
                    return_type='int' if lang == 'Java' else ''
                )
            else:
                # No template for this language -> minimal stub.
                code = f"// {lang} code example\n{func_name}({params})"

            yield {
                'id': f'code_{i:06d}',
                'language': lang,
                'code': code,
                'func_name': func_name,
                'lines_of_code': len(code.split('\n')),
                # BUGFIX: the original check was `'docstring' in code.lower()`,
                # which is always False because the literal word "docstring"
                # never survives template formatting (only the placeholder is
                # named that). Check whether the actual docstring text was
                # embedded in the snippet instead.
                'has_docstring': docstring in code,
                'complexity': np.random.choice(['low', 'medium', 'high']),
                'repo': f'github.com/user/repo_{i % 100}',
                'stars': np.random.randint(0, 10000)
            }

    return Dataset.from_generator(gen)
|
|
|
|
|
print("\n💻 Kod Dataset'i Oluşturuluyor...") |
|
|
code_dataset = generate_code_dataset(2000) |
|
|
|
|
|
print(f"✅ {len(code_dataset)} kod örneği yüklendi") |
|
|
print(f"\nÖrnek kod:") |
|
|
code_sample = code_dataset[0] |
|
|
print(f" ID: {code_sample['id']}") |
|
|
print(f" Dil: {code_sample['language']}") |
|
|
print(f" Satır sayısı: {code_sample['lines_of_code']}") |
|
|
print(f" Karmaşıklık: {code_sample['complexity']}") |
|
|
print(f"\n Kod:\n{code_sample['code']}\n") |
|
|
|
|
|
|
|
|
print("\n📊 Programlama Dili Dağılımı:") |
|
|
languages = [ex['language'] for ex in code_dataset] |
|
|
lang_counts = Counter(languages) |
|
|
for lang, count in lang_counts.most_common(): |
|
|
pct = (count / len(code_dataset)) * 100 |
|
|
print(f" {lang}: {count} ({pct:.1f}%)") |
|
|
|
|
|
|
|
|
print("\n📈 Kod Metrikleri:") |
|
|
loc_values = [ex['lines_of_code'] for ex in code_dataset] |
|
|
print(f" Ortalama satır sayısı: {np.mean(loc_values):.1f}") |
|
|
print(f" Median satır sayısı: {np.median(loc_values):.1f}") |
|
|
|
|
|
has_docstring = sum([1 for ex in code_dataset if ex['has_docstring']]) |
|
|
print(f" Docstring oranı: {(has_docstring/len(code_dataset)*100):.1f}%") |
|
|
|
|
|
|
|
|
print("\n🔧 Kod Preprocessing:") |
|
|
|
|
|
def preprocess_code(examples):
    """Derive simple static metrics for a batch of code snippets.

    Adds columns: 'functions' (names matched by def/function/public-method
    patterns), 'comment_count' (comment-marker occurrences), 'code_chars'
    and 'code_tokens' (whitespace tokens).
    """
    func_pattern = re.compile(r'def\s+(\w+)|function\s+(\w+)|public\s+\w+\s+(\w+)')
    comment_pattern = re.compile(r'#|//|/\*|\*/')

    functions = []
    comment_counts = []
    for snippet in examples['code']:
        # findall yields one tuple per match; keep the non-empty group.
        groups = func_pattern.findall(snippet)
        functions.append([name for match in groups for name in match if name])
        comment_counts.append(len(comment_pattern.findall(snippet)))

    return {
        'functions': functions,
        'comment_count': comment_counts,
        'code_chars': [len(snippet) for snippet in examples['code']],
        'code_tokens': [len(snippet.split()) for snippet in examples['code']],
    }
|
|
|
|
|
# Run the static-metric extraction over the code dataset in batches.
code_processed = code_dataset.map(
    preprocess_code,
    batched=True,
    batch_size=500,
    desc="Analyzing code"
)

print(f"✅ {len(code_processed)} kod örneği analiz edildi")
print(f"\nÖrnek analiz:")
print(f"   Fonksiyonlar: {code_processed[0]['functions']}")
print(f"   Yorum sayısı: {code_processed[0]['comment_count']}")
print(f"   Token sayısı: {code_processed[0]['code_tokens']}")

# Section 3: financial datasets.
print("\n" + "="*70)
print("3. FİNANSAL ANALİZ DATASETS")
print("="*70)
|
|
def generate_financial_dataset(num_samples=1000):
    """Build a synthetic dataset of financial news records.

    Each record pairs a sentiment-templated news text with random market
    metadata (stock change, volume, market cap, sector). Returns a
    `datasets.Dataset` built from a generator.
    """
    companies = ['TechCorp', 'FinanceBank', 'RetailCo', 'EnergyInc', 'HealthMed']
    sentiments = ['positive', 'negative', 'neutral']
    categories = ['earnings', 'merger', 'product_launch', 'scandal', 'expansion']

    def gen():
        for i in range(num_samples):
            company = np.random.choice(companies)
            sentiment = np.random.choice(sentiments)
            category = np.random.choice(categories)

            # Text template is chosen by the sampled sentiment only;
            # 'category' is stored as metadata but not reflected in the text.
            if sentiment == 'positive':
                text = f"{company} announces strong quarterly earnings, exceeding market expectations. " \
                       f"Stock prices surged following the announcement. Analysts remain optimistic."
            elif sentiment == 'negative':
                text = f"{company} faces challenges in the current market. " \
                       f"Quarterly results fell short of expectations. Investors express concern."
            else:
                text = f"{company} maintains steady performance in Q{i%4+1}. " \
                       f"Market reaction remains moderate. Company outlook unchanged."

            yield {
                'id': f'fin_{i:06d}',
                'company': company,
                'text': text,
                'sentiment': sentiment,
                'category': category,
                # Synthetic calendar date; month/day derived from the index.
                'date': f'2024-{(i%12)+1:02d}-{(i%28)+1:02d}',
                'stock_change': np.random.uniform(-10, 10),
                'volume': np.random.randint(1000000, 10000000),
                'market_cap': np.random.uniform(1e9, 100e9),
                'sector': np.random.choice(['Tech', 'Finance', 'Retail', 'Energy', 'Healthcare'])
            }

    return Dataset.from_generator(gen)
|
|
|
|
|
print("\n💰 Finansal Dataset Oluşturuluyor...") |
|
|
financial_dataset = generate_financial_dataset(2000) |
|
|
|
|
|
print(f"✅ {len(financial_dataset)} finansal kayıt yüklendi") |
|
|
print(f"\nÖrnek finansal kayıt:") |
|
|
fin_sample = financial_dataset[0] |
|
|
print(f" ID: {fin_sample['id']}") |
|
|
print(f" Şirket: {fin_sample['company']}") |
|
|
print(f" Sentiment: {fin_sample['sentiment']}") |
|
|
print(f" Kategori: {fin_sample['category']}") |
|
|
print(f" Hisse değişimi: {fin_sample['stock_change']:.2f}%") |
|
|
print(f" Metin: {fin_sample['text'][:120]}...") |
|
|
|
|
|
|
|
|
print("\n📊 Sentiment Dağılımı:") |
|
|
sentiments = [ex['sentiment'] for ex in financial_dataset] |
|
|
sent_counts = Counter(sentiments) |
|
|
for sent, count in sent_counts.items(): |
|
|
pct = (count / len(financial_dataset)) * 100 |
|
|
print(f" {sent.capitalize()}: {count} ({pct:.1f}%)") |
|
|
|
|
|
|
|
|
print("\n🏢 Şirket Bazlı Analiz:") |
|
|
companies = [ex['company'] for ex in financial_dataset] |
|
|
company_counts = Counter(companies) |
|
|
for company, count in company_counts.most_common(): |
|
|
avg_change = np.mean([ex['stock_change'] for ex in financial_dataset if ex['company'] == company]) |
|
|
print(f" {company}: {count} haber, ortalama değişim: {avg_change:+.2f}%") |
|
|
|
|
|
|
|
|
print("\n🔧 Finansal Text Preprocessing:") |
|
|
|
|
|
def preprocess_financial_text(examples):
    """Extract simple financial signals from a batch of news texts.

    Adds columns: 'numbers_found' (numeric tokens incl. percentages),
    'financial_term_count' (hits from a small finance lexicon),
    'text_length', and 'has_percentage'.
    """
    finance_terms = ('earnings', 'stock', 'market', 'quarterly', 'revenue',
                     'profit', 'loss', 'growth', 'decline')
    number_pattern = re.compile(r'\d+\.?\d*%?')

    numbers_per_text = []
    term_hits = []
    for text in examples['text']:
        numbers_per_text.append(number_pattern.findall(text))
        lowered = text.lower()
        term_hits.append(sum(1 for term in finance_terms if term in lowered))

    return {
        'numbers_found': numbers_per_text,
        'financial_term_count': term_hits,
        'text_length': [len(text) for text in examples['text']],
        'has_percentage': ['%' in text for text in examples['text']],
    }
|
|
|
|
|
# Apply the financial-signal extraction over the dataset in batches.
financial_processed = financial_dataset.map(
    preprocess_financial_text,
    batched=True,
    batch_size=500,
    desc="Processing financial texts"
)

print(f"✅ {len(financial_processed)} finansal kayıt işlendi")
print(f"\nÖrnek analiz:")
print(f"   Sayılar: {financial_processed[0]['numbers_found']}")
print(f"   Finansal terim sayısı: {financial_processed[0]['financial_term_count']}")
print(f"   Yüzde var mı: {financial_processed[0]['has_percentage']}")

# Section 4: medical/healthcare datasets.
print("\n" + "="*70)
print("4. TIBBİ/SAĞLIK DATASETS")
print("="*70)
|
|
def generate_medical_dataset(num_samples=1000):
    """Build a synthetic dataset of clinical notes and diagnoses.

    Each record combines a templated clinical note with random patient
    metadata (age, gender, visit date, confidence, follow-up flag).
    Returns a `datasets.Dataset` built from a generator.
    """
    conditions = ['Diabetes', 'Hypertension', 'Asthma', 'Arthritis', 'Migraine']
    treatments = ['Medication', 'Physical Therapy', 'Surgery', 'Lifestyle Changes']
    severities = ['mild', 'moderate', 'severe']

    def gen():
        for i in range(num_samples):
            condition = np.random.choice(conditions)
            treatment = np.random.choice(treatments)
            severity = np.random.choice(severities)

            # Templated free-text note; varies only in severity/condition/treatment.
            note = f"Patient presents with {severity} {condition.lower()}. " \
                   f"Symptoms include relevant clinical findings. " \
                   f"Recommended treatment: {treatment}. " \
                   f"Follow-up scheduled. Patient advised on preventive measures."

            yield {
                'id': f'med_{i:06d}',
                'patient_id': f'P{i:05d}',
                'condition': condition,
                'severity': severity,
                'treatment': treatment,
                'note': note,
                'age': np.random.randint(18, 90),
                'gender': np.random.choice(['M', 'F']),
                # Synthetic visit date; month/day derived from the index.
                'visit_date': f'2024-{(i%12)+1:02d}-{(i%28)+1:02d}',
                'diagnosis_confidence': np.random.uniform(0.7, 1.0),
                'follow_up_required': np.random.choice([True, False])
            }

    return Dataset.from_generator(gen)
|
|
|
|
|
print("\n🏥 Tıbbi Dataset Oluşturuluyor...") |
|
|
medical_dataset = generate_medical_dataset(2000) |
|
|
|
|
|
print(f"✅ {len(medical_dataset)} tıbbi kayıt yüklendi") |
|
|
print(f"\nÖrnek tıbbi kayıt:") |
|
|
med_sample = medical_dataset[0] |
|
|
print(f" ID: {med_sample['id']}") |
|
|
print(f" Hasta ID: {med_sample['patient_id']}") |
|
|
print(f" Durum: {med_sample['condition']}") |
|
|
print(f" Şiddet: {med_sample['severity']}") |
|
|
print(f" Tedavi: {med_sample['treatment']}") |
|
|
print(f" Yaş: {med_sample['age']}") |
|
|
print(f" Tanı güveni: {med_sample['diagnosis_confidence']:.2f}") |
|
|
print(f" Not: {med_sample['note'][:100]}...") |
|
|
|
|
|
|
|
|
print("\n📊 Tıbbi Durum Dağılımı:") |
|
|
conditions = [ex['condition'] for ex in medical_dataset] |
|
|
cond_counts = Counter(conditions) |
|
|
for cond, count in cond_counts.most_common(): |
|
|
pct = (count / len(medical_dataset)) * 100 |
|
|
print(f" {cond}: {count} ({pct:.1f}%)") |
|
|
|
|
|
|
|
|
print("\n⚠️ Şiddet Dağılımı:") |
|
|
severities = [ex['severity'] for ex in medical_dataset] |
|
|
sev_counts = Counter(severities) |
|
|
for sev, count in sorted(sev_counts.items()): |
|
|
pct = (count / len(medical_dataset)) * 100 |
|
|
print(f" {sev.capitalize()}: {count} ({pct:.1f}%)") |
|
|
|
|
|
|
|
|
print("\n👥 Yaş Grubu Analizi:") |
|
|
ages = [ex['age'] for ex in medical_dataset] |
|
|
age_groups = { |
|
|
'18-30': sum([1 for age in ages if 18 <= age <= 30]), |
|
|
'31-50': sum([1 for age in ages if 31 <= age <= 50]), |
|
|
'51-70': sum([1 for age in ages if 51 <= age <= 70]), |
|
|
'71+': sum([1 for age in ages if age > 70]) |
|
|
} |
|
|
for group, count in age_groups.items(): |
|
|
pct = (count / len(ages)) * 100 |
|
|
print(f" {group}: {count} ({pct:.1f}%)") |
|
|
|
|
|
|
|
|
print("\n🔧 Tıbbi Text Preprocessing (PHI Removal):") |
|
|
|
|
|
def preprocess_medical_text(examples):
    """Anonymize clinical notes and extract simple entity counts.

    Simulates PHI (Protected Health Information) removal: masks the
    patient id and ISO dates inside each note, and counts occurrences
    of a small clinical vocabulary in the original note.
    """
    clinical_terms = ('patient', 'symptoms', 'treatment', 'diagnosis',
                      'medication', 'therapy', 'condition')
    date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')

    anonymized = []
    entity_counts = []
    for note, patient_id in zip(examples['note'], examples['patient_id']):
        # Mask the patient identifier, then any ISO-formatted dates.
        masked = note.replace(patient_id, '[PATIENT_ID]')
        masked = date_pattern.sub('[DATE]', masked)
        anonymized.append(masked)

        lowered = note.lower()
        entity_counts.append(sum(1 for term in clinical_terms if term in lowered))

    return {
        'note_anonymized': anonymized,
        'medical_entity_count': entity_counts,
        'note_length': [len(note) for note in examples['note']],
        'requires_follow_up': examples['follow_up_required'],
    }
|
|
|
|
|
# Anonymize the whole medical dataset in batches.
medical_processed = medical_dataset.map(
    preprocess_medical_text,
    batched=True,
    batch_size=500,
    desc="Anonymizing medical records"
)

print(f"✅ {len(medical_processed)} tıbbi kayıt anonimleştirildi")
print(f"\nÖrnek anonimleştirilmiş not:")
print(f"   Orijinal: {medical_processed[0]['note'][:100]}...")
print(f"   Anonimleştirilmiş: {medical_processed[0]['note_anonymized'][:100]}...")
print(f"   Tıbbi entity sayısı: {medical_processed[0]['medical_entity_count']}")

# Section 5: naive per-domain tokenization demos.
print("\n" + "="*70)
print("5. DOMAIN-SPECIFIC TOKENIZATION")
print("="*70)

print("\n🔤 Domain-Specific Tokenization Stratejileri:")

# Scientific text: whitespace word tokens and period-split sentences.
print("\n1️⃣ Bilimsel Metin Tokenization:")
scientific_sample = scientific_dataset[0]['abstract']
print(f"   Orijinal: {scientific_sample[:80]}...")

words = scientific_sample.split()
print(f"   Word tokens: {len(words)} kelime")
print(f"   İlk 5 token: {words[:5]}")

sentences = scientific_sample.split('.')
print(f"   Sentence tokens: {len([s for s in sentences if s.strip()])} cümle")

# Code: line split plus a word/punctuation regex tokenizer.
print("\n2️⃣ Kod Tokenization:")
code_sample = code_dataset[0]['code']
print(f"   Kod:\n{code_sample}")

lines = code_sample.split('\n')
print(f"   Satır sayısı: {len(lines)}")

code_tokens = re.findall(r'\w+|[^\w\s]', code_sample)
print(f"   Token sayısı: {len(code_tokens)}")
print(f"   İlk 10 token: {code_tokens[:10]}")

# Section 6: combining datasets from different domains.
print("\n" + "="*70)
print("6. CROSS-DOMAIN DATASET BİRLEŞTİRME")
print("="*70)

print("\n🔄 Farklı Domain'lerden Dataset Birleştirme:")

# Take 100 rows from each domain for the merge demo.
sci_subset = scientific_dataset.select(range(100))
code_subset = code_dataset.select(range(100))
fin_subset = financial_dataset.select(range(100))
|
|
def normalize_scientific(example):
    """Map a scientific-paper record onto the shared multi-domain schema."""
    meta = {'type': example['domain'], 'year': example['year']}
    return {'text': example['abstract'], 'domain': 'scientific', 'metadata': meta}
|
|
|
|
|
def normalize_code(example):
    """Map a code-snippet record onto the shared multi-domain schema."""
    meta = {'language': example['language'], 'lines': example['lines_of_code']}
    return {'text': example['code'], 'domain': 'code', 'metadata': meta}
|
|
|
|
|
def normalize_financial(example):
    """Map a financial-news record onto the shared multi-domain schema."""
    meta = {'sentiment': example['sentiment'], 'company': example['company']}
    return {'text': example['text'], 'domain': 'financial', 'metadata': meta}
|
|
|
|
|
print("\n📦 Dataset'leri normalize ediyoruz...") |
|
|
sci_norm = sci_subset.map(normalize_scientific, remove_columns=sci_subset.column_names) |
|
|
code_norm = code_subset.map(normalize_code, remove_columns=code_subset.column_names) |
|
|
fin_norm = fin_subset.map(normalize_financial, remove_columns=fin_subset.column_names) |
|
|
|
|
|
|
|
|
from datasets import concatenate_datasets |
|
|
multi_domain = concatenate_datasets([sci_norm, code_norm, fin_norm]) |
|
|
|
|
|
print(f"✅ Multi-domain dataset: {len(multi_domain)} örnek") |
|
|
print(f"\nDomain dağılımı:") |
|
|
domains = [ex['domain'] for ex in multi_domain] |
|
|
domain_dist = Counter(domains) |
|
|
for domain, count in domain_dist.items(): |
|
|
print(f" {domain}: {count}") |
|
|
|
|
|
print(f"\nÖrnek multi-domain kayıtlar:") |
|
|
for i in range(3): |
|
|
ex = multi_domain[i * 100] |
|
|
print(f"\n {i+1}. Domain: {ex['domain']}") |
|
|
print(f" Text: {ex['text'][:80]}...") |
|
|
print(f" Metadata: {ex['metadata']}") |
|
|
|
|
|
|
|
|
print("\n" + "="*70) |
|
|
print("7. DOMAIN ADAPTATION TEKNİKLERİ") |
|
|
print("="*70) |
|
|
|
|
|
print("\n🎯 Domain Adaptation Stratejileri:") |
|
|
|
|
|
|
|
|
print("\n1️⃣ Domain-Specific Vocabulary Analysis:") |
|
|
|
|
|
def analyze_domain_vocabulary(dataset, text_column, domain_name):
    """Summarize the word inventory of one domain's text column.

    Tokenizes each record's `text_column` by lowercasing and splitting on
    whitespace, then reports total/unique word counts and the ten most
    frequent words.
    """
    vocab_counts = Counter()
    total_words = 0
    for record in dataset:
        tokens = record[text_column].lower().split()
        vocab_counts.update(tokens)
        total_words += len(tokens)

    return {
        'domain': domain_name,
        'total_words': total_words,
        'unique_words': len(vocab_counts),
        'top_10_words': vocab_counts.most_common(10),
    }
|
|
|
|
|
|
|
|
sci_vocab = analyze_domain_vocabulary( |
|
|
scientific_dataset.select(range(500)), |
|
|
'abstract', |
|
|
'Scientific' |
|
|
) |
|
|
code_vocab = analyze_domain_vocabulary( |
|
|
code_dataset.select(range(500)), |
|
|
'code', |
|
|
'Code' |
|
|
) |
|
|
fin_vocab = analyze_domain_vocabulary( |
|
|
financial_dataset.select(range(500)), |
|
|
'text', |
|
|
'Financial' |
|
|
) |
|
|
|
|
|
print("\n📚 Domain Vocabulary İstatistikleri:") |
|
|
for vocab in [sci_vocab, code_vocab, fin_vocab]: |
|
|
print(f"\n {vocab['domain']}:") |
|
|
print(f" Toplam kelime: {vocab['total_words']:,}") |
|
|
print(f" Benzersiz kelime: {vocab['unique_words']:,}") |
|
|
print(f" Vocabulary zenginliği: {vocab['unique_words']/vocab['total_words']:.3f}") |
|
|
print(f" Top 5 kelime: {[w for w, c in vocab['top_10_words'][:5]]}") |
|
|
|
|
|
|
|
|
print("\n2️⃣ Domain-Specific Data Augmentation:") |
|
|
|
|
|
def augment_scientific_text(example):
    """Return the example plus a lightly augmented copy of its abstract.

    Augmentation is naive synonym substitution applied in order:
    novel->innovative, propose->present, demonstrate->show.
    """
    substitutions = (
        ('novel', 'innovative'),
        ('propose', 'present'),
        ('demonstrate', 'show'),
    )
    augmented = example['abstract']
    for original_word, replacement in substitutions:
        augmented = augmented.replace(original_word, replacement)

    return {**example, 'abstract_augmented': augmented}
|
|
|
|
|
print("\n Bilimsel metin augmentation örneği:") |
|
|
aug_sample = augment_scientific_text(scientific_dataset[0]) |
|
|
print(f" Original: {aug_sample['abstract'][:100]}...") |
|
|
print(f" Augmented: {aug_sample['abstract_augmented'][:100]}...") |
|
|
|
|
|
|
|
|
print("\n3️⃣ Domain-Specific Filtering:") |
|
|
|
|
|
def filter_high_quality_scientific(example):
    """Keep only well-cited, recent papers with a substantial abstract.

    True when citations > 50, year >= 2020 and the abstract exceeds
    100 whitespace-separated words.
    """
    if example['citations'] <= 50:
        return False
    if example['year'] < 2020:
        return False
    return len(example['abstract'].split()) > 100
|
|
|
|
|
# Apply the quality predicate over the full scientific dataset.
high_quality_sci = scientific_dataset.filter(
    filter_high_quality_scientific,
    desc="Filtering high-quality papers"
)

print(f"\n   Kaliteli makale filtreleme:")
print(f"   Orijinal: {len(scientific_dataset)} makale")
print(f"   Filtrelenmiş: {len(high_quality_sci)} makale")
print(f"   Oran: {len(high_quality_sci)/len(scientific_dataset)*100:.1f}%")

# Section 8: per-domain evaluation metrics.
print("\n" + "="*70)
print("8. DOMAIN-SPECIFIC EVALUATION METRİKLERİ")
print("="*70)

print("\n📊 Domain-Specific Kalite Metrikleri:")
|
|
def calculate_domain_metrics(dataset, domain_name):
    """Compute domain-specific quality metrics for a dataset.

    Supported domains: 'scientific', 'code', 'financial' and (new)
    'medical'. Returns a metrics dict, or None for an unknown domain —
    preserving the original fall-through behavior, now made explicit.
    """
    if domain_name == 'scientific':
        # Citation/author averages plus the share of recent (>= 2020) papers.
        avg_citations = np.mean([ex['citations'] for ex in dataset])
        avg_authors = np.mean([len(ex['authors']) for ex in dataset])
        recent_papers = sum(1 for ex in dataset if ex['year'] >= 2020)

        return {
            'domain': domain_name,
            'avg_citations': avg_citations,
            'avg_authors': avg_authors,
            'recent_ratio': recent_papers / len(dataset)
        }

    elif domain_name == 'code':
        # Size, documentation and popularity (>1000 stars) ratios.
        avg_loc = np.mean([ex['lines_of_code'] for ex in dataset])
        has_doc = sum(1 for ex in dataset if ex['has_docstring'])
        high_stars = sum(1 for ex in dataset if ex['stars'] > 1000)

        return {
            'domain': domain_name,
            'avg_lines_of_code': avg_loc,
            'documentation_ratio': has_doc / len(dataset),
            'popular_ratio': high_stars / len(dataset)
        }

    elif domain_name == 'financial':
        # Sentiment distribution plus mean/std of stock changes.
        changes = [ex['stock_change'] for ex in dataset]
        sent_dist = Counter(ex['sentiment'] for ex in dataset)

        return {
            'domain': domain_name,
            'sentiment_distribution': dict(sent_dist),
            'avg_stock_change': np.mean(changes),
            'volatility': np.std(changes)
        }

    elif domain_name == 'medical':
        # NEW: the module builds a medical dataset but previously had no
        # metrics branch for it. Severity mix, mean age, follow-up ratio.
        severity_dist = Counter(ex['severity'] for ex in dataset)
        follow_ups = sum(1 for ex in dataset if ex['follow_up_required'])

        return {
            'domain': domain_name,
            'severity_distribution': dict(severity_dist),
            'avg_age': np.mean([ex['age'] for ex in dataset]),
            'follow_up_ratio': follow_ups / len(dataset)
        }

    # Unknown domain: explicit None instead of silently falling off the end.
    return None
|
|
|
|
|
print("\n1️⃣ Scientific Metrics:") |
|
|
sci_metrics = calculate_domain_metrics(scientific_dataset, 'scientific') |
|
|
for key, value in sci_metrics.items(): |
|
|
print(f" {key}: {value}") |
|
|
|
|
|
print("\n2️⃣ Code Metrics:") |
|
|
code_metrics = calculate_domain_metrics(code_dataset, 'code') |
|
|
for key, value in code_metrics.items(): |
|
|
print(f" {key}: {value}") |
|
|
|
|
|
print("\n3️⃣ Financial Metrics:") |
|
|
fin_metrics = calculate_domain_metrics(financial_dataset, 'financial') |
|
|
for key, value in fin_metrics.items(): |
|
|
print(f" {key}: {value}") |
|
|
|
|
|
|
|
|
print("\n" + "="*70) |
|
|
print("9. BEST PRACTICES - DOMAIN-SPECIFIC DATASETS") |
|
|
print("="*70) |
|
|
|
|
|
print(""" |
|
|
✅ BİLİMSEL DATASETS: |
|
|
- Citation metadata ekle |
|
|
- Abstract + full text ayrımı |
|
|
- Domain/field classification |
|
|
- Author disambiguation |
|
|
- Reference parsing |
|
|
- LaTeX formül handling |
|
|
|
|
|
✅ KOD DATASETS: |
|
|
- Programlama dili ayrımı |
|
|
- Syntax parsing |
|
|
- Docstring extraction |
|
|
- Repository metadata |
|
|
- License bilgisi |
|
|
- Code quality metrics (complexity, coverage) |
|
|
|
|
|
✅ FİNANSAL DATASETS: |
|
|
- Sentiment annotation |
|
|
- Entity recognition (companies, people) |
|
|
- Temporal information |
|
|
- Numerical data extraction |
|
|
- Market data integration |
|
|
- Real-time updates |
|
|
|
|
|
✅ TIBBİ DATASETS: |
|
|
- PHI (Protected Health Information) removal |
|
|
- HIPAA compliance |
|
|
- Clinical terminology standardization |
|
|
- ICD code mapping |
|
|
- Anonymization |
|
|
- Ethical considerations |
|
|
|
|
|
✅ GENEL PRENSİPLER: |
|
|
- Domain expertise gerekir |
|
|
- Specialized tokenization |
|
|
- Domain-specific validation |
|
|
- Quality filtering |
|
|
- Ethical guidelines takip et |
|
|
- License ve copyright kontrol et |
|
|
|
|
|
✅ DATA QUALITY: |
|
|
- Domain experts ile validate et |
|
|
- Inter-annotator agreement hesapla |
|
|
- Bias analysis yap |
|
|
- Coverage analysis |
|
|
- Statistical validation |
|
|
- Regular updates |
|
|
""") |
|
|
|
|
|
|
|
|
print("\n" + "="*70) |
|
|
print("✅ BÖLÜM 2 TAMAMLANDI!") |
|
|
print("="*70) |
|
|
print(f""" |
|
|
Bu bölümde öğrendikleriniz: |
|
|
✓ Bilimsel makale datasets ({len(scientific_dataset)} örnek) |
|
|
✓ Kod datasets ({len(code_dataset)} örnek) |
|
|
✓ Finansal analiz datasets ({len(financial_dataset)} örnek) |
|
|
✓ Tıbbi/sağlık datasets ({len(medical_dataset)} örnek) |
|
|
✓ Domain-specific preprocessing |
|
|
✓ Cross-domain dataset birleştirme |
|
|
✓ Domain adaptation teknikleri |
|
|
✓ Domain-specific evaluation metrikleri |
|
|
|
|
|
📊 ÜRETİLEN DATASETS: |
|
|
- Scientific: {len(scientific_dataset):,} makale |
|
|
- Code: {len(code_dataset):,} kod örneği |
|
|
- Financial: {len(financial_dataset):,} finansal kayıt |
|
|
- Medical: {len(medical_dataset):,} tıbbi kayıt |
|
|
- Multi-domain: {len(multi_domain):,} birleştirilmiş örnek |
|
|
|
|
|
📚 SONRAKI BÖLÜM: İleri Teknikler |
|
|
- Dataset streaming (büyük datasets için) |
|
|
- Custom data collators |
|
|
- Feature extraction ve transformation |
|
|
- Dataset preprocessing pipelines |
|
|
- Advanced filtering strategies |
|
|
""") |
|
|
|
|
|
print("\n🚀 Harika! İkinci bölümü tamamladık!") |
|
|
print("Üçüncü bölüme (İleri Teknikler) geçelim mi?") |
|
|
|