import os
import random
from pathlib import Path

from langchain_ollama import OllamaLLM

# NOTE(review): `os` appears unused in this file — confirm before removing.

# --- Configuration ---
INPUT_DIR = "content"                       # source documents, one subfolder per category
OUTPUT_DIR = "synthetic_content"            # originals + generated variants land here
LOG_FILE = "synthetic_processed_files.log"  # remembers processed files across runs
MODEL_NAME = "llama3"                       # local Ollama model used for generation

TARGET_COUNT_PER_TYPE = 60    # desired total documents per category (originals + synthetic)
MIN_SYNTHETIC_PER_FILE = 1    # every source file gets at least this many variants

# Module-level LLM client; temperature 0.7 trades determinism for varied output.
llm = OllamaLLM(model=MODEL_NAME, temperature=0.7)
|
def load_processed_files():
    """Return the set of file paths already recorded in the log file."""
    log_path = Path(LOG_FILE)
    if log_path.exists():
        return set(log_path.read_text(encoding='utf-8').splitlines())
    return set()
| |
|
def save_to_log(file_path):
    """Append a single processed file path to the log file."""
    with Path(LOG_FILE).open("a", encoding="utf-8") as log:
        log.write(f"{file_path}\n")
| |
|
def get_files_by_category(input_path):
    """Map each subdirectory name to its list of .txt files.

    Subdirectories with no .txt files are omitted; plain files at the
    top level of ``input_path`` are ignored.
    """
    result = {}
    for entry in input_path.iterdir():
        if not entry.is_dir():
            continue
        txt_files = list(entry.glob("*.txt"))
        if txt_files:
            result[entry.name] = txt_files
    return result
| |
|
def calculate_variants_map(files, target_total, min_per_file=None):
    """Plan how many synthetic variants to generate for each source file.

    Every file is guaranteed at least ``min_per_file`` variants. If the
    projected total (originals + guaranteed variants) still falls short of
    ``target_total``, the shortfall is spread evenly across the files and
    the remainder is handed out one-by-one to randomly chosen files.

    Args:
        files: Source file paths (any hashable keys).
        target_total: Desired total document count, counting originals.
        min_per_file: Minimum variants per file. Defaults to the module
            constant MIN_SYNTHETIC_PER_FILE (resolved at call time, so the
            original two-argument call sites behave exactly as before).

    Returns:
        dict mapping each file to its variant count; {} for empty input.
    """
    if min_per_file is None:
        min_per_file = MIN_SYNTHETIC_PER_FILE

    current_count = len(files)
    if current_count == 0:
        # Guard: the division below would raise ZeroDivisionError.
        return {}

    assignments = {f: min_per_file for f in files}
    # Projected output so far: the originals plus the guaranteed minimum.
    current_total_projected = current_count + (current_count * min_per_file)
    missing = target_total - current_total_projected

    if missing <= 0:
        return assignments

    base_add = missing // current_count
    remainder = missing % current_count
    for f in files:
        assignments[f] += base_add
    # Distribute the leftover quota to a random subset of files.
    for f in random.sample(files, remainder):
        assignments[f] += 1
    return assignments
| |
|
def generate_synthetic_text(text):
    """Ask the LLM for a synthetic document variant, suppressing AI chatter.

    Returns the cleaned text, or None if the model call fails.
    """
    prompt = f"""[SYSTEM: You are a raw data generator. Return ONLY the document text. No conversational fillers.]
SOURCE DOCUMENT TO TRANSFORM:
{text[:3500]}

TASK:
1. Create a synthetic version of this document.
2. Fill all placeholders/blanks with realistic Polish data.
3. Replace all existing names, dates, and numbers with new ones.
4. Add minor OCR errors (swapped letters, missing spaces).
5. Output MUST be in Polish.

OUTPUT ONLY THE TRANSFORMED TEXT. DO NOT EXPLAIN. DO NOT SAY "HERE IS THE TEXT".
---
SYNTHETIC TEXT START:"""

    try:
        raw = llm.invoke(prompt)
        # Strip the echoed prompt marker, then any markdown code fences.
        cleaned = raw.replace("SYNTHETIC TEXT START:", "").strip()
        for fence in ("```text", "```"):
            cleaned = cleaned.replace(fence, "")
        return cleaned.strip()
    except Exception as e:
        # Best-effort boundary: report and let the caller skip this variant.
        print(f" ❌ Błąd AI: {e}")
        return None
| |
|
def main():
    """Generate synthetic variants for every category of input documents.

    Copies each original into OUTPUT_DIR/<category>, generates the planned
    number of synthetic variants per file via the LLM, and records each
    processed file in LOG_FILE so reruns skip already-handled files.
    """
    input_path = Path(INPUT_DIR)
    output_path = Path(OUTPUT_DIR)
    processed_files = load_processed_files()

    if not input_path.exists():
        print(f"❌ Brak folderu {INPUT_DIR}")
        return

    print("🔍 Analiza struktury i historii...")
    categories = get_files_by_category(input_path)
    if not categories:
        return

    # Per-category target: the configured count, but never below what the
    # minimum-variants-per-file plan would already produce for the largest
    # category.
    max_files = max(len(files) for files in categories.values())
    final_target = TARGET_COUNT_PER_TYPE if TARGET_COUNT_PER_TYPE > 0 else max_files
    if final_target < max_files:
        final_target = max_files + (max_files * MIN_SYNTHETIC_PER_FILE)

    total_generated = 0

    for cat_name, files in categories.items():
        target_dir = output_path / cat_name
        target_dir.mkdir(parents=True, exist_ok=True)

        # Skip files already recorded in the log by previous runs.
        files_to_process = [f for f in files if str(f) not in processed_files]

        if not files_to_process:
            print(f"✅ Kategoria [{cat_name}] już w pełni przetworzona.")
            continue

        augment_plan = calculate_variants_map(files_to_process, final_target)
        print(f"\n📂 Kategoria: [{cat_name}] (Przetwarzanie {len(files_to_process)} nowych plików)")

        for file_path in files_to_process:
            try:
                original_text = file_path.read_text(encoding='utf-8')
            except (OSError, UnicodeDecodeError) as e:
                # BUGFIX: was a bare `except:` that also swallowed
                # KeyboardInterrupt/SystemExit (the script explicitly
                # supports Ctrl+C). Only unreadable/undecodable files are
                # skipped now, and the reason is reported.
                print(f" ⚠️ Pominięto {file_path.name}: {e}")
                continue

            # Keep a copy of the original alongside its synthetic variants.
            (target_dir / file_path.name).write_text(original_text, encoding='utf-8')

            num_variants = augment_plan[file_path]
            print(f" 📄 {file_path.name} ({num_variants} wariantów)", end=" ", flush=True)

            for i in range(1, num_variants + 1):
                new_text = generate_synthetic_text(original_text)
                if new_text:
                    new_name = f"{file_path.stem}_synth_{i}.txt"
                    (target_dir / new_name).write_text(new_text, encoding='utf-8')
                    total_generated += 1
                    print(".", end="", flush=True)

            # Log only after all variants for this file were attempted, so an
            # interrupted run retries the file next time.
            save_to_log(str(file_path))
            print(" Gotowe")

    print(f"\n✅ Zakończono! Wygenerowano {total_generated} nowych plików.")
| |
|
| | if __name__ == "__main__": |
| | try: |
| | main() |
| | except KeyboardInterrupt: |
| | print("\n🛑 Zatrzymano ręcznie.") |