| import os |
| import random |
| from pathlib import Path |
| from langchain_ollama import OllamaLLM |
|
|
| |
# --- Configuration ---
INPUT_DIR = "content"             # source directory: one subfolder per document category
OUTPUT_DIR = "synthetic_content"  # generated files are mirrored here, same category layout
LOG_FILE = "synthetic_processed_files.log"  # resume log: one processed source path per line
MODEL_NAME = "llama3"             # Ollama model used for generation

TARGET_COUNT_PER_TYPE = 60   # desired total files (originals + synthetic) per category
MIN_SYNTHETIC_PER_FILE = 1   # every source file gets at least this many variants

# Shared LLM client for the whole run; temperature 0.7 favours varied
# rewrites over deterministic output.
llm = OllamaLLM(model=MODEL_NAME, temperature=0.7)
|
|
def load_processed_files():
    """Return the set of file paths already recorded in the resume log."""
    log_path = Path(LOG_FILE)
    if not log_path.exists():
        return set()
    lines = log_path.read_text(encoding='utf-8').splitlines()
    return set(lines)
|
|
def save_to_log(file_path):
    """Append *file_path* as a new line to the resume log."""
    with open(LOG_FILE, "a", encoding="utf-8") as log:
        log.write(f"{file_path}\n")
|
|
def get_files_by_category(input_path):
    """Map each subdirectory name of *input_path* to its list of .txt files.

    Subdirectories containing no .txt files are omitted; non-directory
    entries directly under *input_path* are ignored.
    """
    result = {}
    for entry in input_path.iterdir():
        if not entry.is_dir():
            continue
        txt_files = list(entry.glob("*.txt"))
        if txt_files:
            result[entry.name] = txt_files
    return result
|
|
def calculate_variants_map(files, target_total, min_per_file=None):
    """Decide how many synthetic variants to generate per source file.

    Every file gets at least *min_per_file* variants. If the originals plus
    those guaranteed variants still fall short of *target_total*, the
    shortfall is spread evenly, with the leftover handed to randomly chosen
    distinct files.

    Args:
        files: list of source file identifiers (Path objects in practice).
        target_total: desired total count (originals + synthetic) for the category.
        min_per_file: minimum variants per file; None means use the
            module-level MIN_SYNTHETIC_PER_FILE (backward-compatible default).

    Returns:
        dict mapping each file to its variant count; {} for an empty input.
    """
    if not files:
        # Guard: the original code raised ZeroDivisionError on an empty list.
        return {}
    if min_per_file is None:
        min_per_file = MIN_SYNTHETIC_PER_FILE

    assignments = {f: min_per_file for f in files}
    # Projected total = the originals themselves plus the guaranteed minimum.
    projected_total = len(files) + len(files) * min_per_file
    missing = target_total - projected_total
    if missing <= 0:
        return assignments

    base_add, remainder = divmod(missing, len(files))
    for f in files:
        assignments[f] += base_add
    # random.sample picks *distinct* files for the leftover variants.
    for f in random.sample(files, remainder):
        assignments[f] += 1
    return assignments
|
|
def generate_synthetic_text(text):
    """Ask the LLM for a synthetic Polish variant of *text*, suppressing AI chatter.

    Only the first 3500 characters of *text* are embedded in the prompt to
    keep it within the model's context window.

    Returns:
        The cleaned generated text, or None if the LLM call raises.
    """
    prompt = f"""[SYSTEM: You are a raw data generator. Return ONLY the document text. No conversational fillers.]
SOURCE DOCUMENT TO TRANSFORM:
{text[:3500]}

TASK:
1. Create a synthetic version of this document.
2. Fill all placeholders/blanks with realistic Polish data.
3. Replace all existing names, dates, and numbers with new ones.
4. Add minor OCR errors (swapped letters, missing spaces).
5. Output MUST be in Polish.

OUTPUT ONLY THE TRANSFORMED TEXT. DO NOT EXPLAIN. DO NOT SAY "HERE IS THE TEXT".
---
SYNTHETIC TEXT START:"""

    try:
        response = llm.invoke(prompt)
        # Drop the prompt's trailing marker if the model echoed it back.
        clean_text = response.replace("SYNTHETIC TEXT START:", "").strip()
        # Strip Markdown code fences some models wrap their output in.
        clean_text = clean_text.replace("```text", "").replace("```", "").strip()
        return clean_text
    except Exception as e:
        # Best-effort: report and skip this variant rather than abort the batch.
        print(f" ❌ Błąd AI: {e}")
        return None
|
|
def main():
    """Generate synthetic variants for every .txt file under INPUT_DIR.

    Mirrors the per-category folder structure into OUTPUT_DIR, copies each
    original next to its generated variants, and appends processed source
    paths to LOG_FILE so an interrupted run can resume without repeating work.
    """
    input_path = Path(INPUT_DIR)
    output_path = Path(OUTPUT_DIR)
    processed_files = load_processed_files()

    if not input_path.exists():
        print(f"❌ Brak folderu {INPUT_DIR}")
        return

    print("🔍 Analiza struktury i historii...")
    categories = get_files_by_category(input_path)
    if not categories:
        return

    # Target per category: TARGET_COUNT_PER_TYPE when positive, otherwise the
    # size of the largest category. If even the largest category already
    # exceeds the target, grow it by the guaranteed minimum per file instead.
    max_files = max(len(files) for files in categories.values())
    final_target = TARGET_COUNT_PER_TYPE if TARGET_COUNT_PER_TYPE > 0 else max_files
    if final_target < max_files:
        final_target = max_files + (max_files * MIN_SYNTHETIC_PER_FILE)

    total_generated = 0

    for cat_name, files in categories.items():
        target_dir = output_path / cat_name
        target_dir.mkdir(parents=True, exist_ok=True)

        # Skip anything already recorded in the resume log.
        files_to_process = [f for f in files if str(f) not in processed_files]
        if not files_to_process:
            print(f"✅ Kategoria [{cat_name}] już w pełni przetworzona.")
            continue

        augment_plan = calculate_variants_map(files_to_process, final_target)
        print(f"\n📂 Kategoria: [{cat_name}] (Przetwarzanie {len(files_to_process)} nowych plików)")

        for file_path in files_to_process:
            try:
                original_text = file_path.read_text(encoding='utf-8')
            except (OSError, UnicodeDecodeError) as e:
                # Was a bare `except:` that swallowed everything, including
                # KeyboardInterrupt; now only unreadable or undecodable
                # files are skipped, and the skip is reported.
                print(f" ⚠️ Pominięto {file_path.name}: {e}")
                continue

            # Copy the original into the output tree so each category is complete.
            (target_dir / file_path.name).write_text(original_text, encoding='utf-8')

            num_variants = augment_plan[file_path]
            print(f" 📄 {file_path.name} ({num_variants} wariantów)", end=" ", flush=True)

            for i in range(1, num_variants + 1):
                new_text = generate_synthetic_text(original_text)
                if new_text:
                    new_name = f"{file_path.stem}_synth_{i}.txt"
                    (target_dir / new_name).write_text(new_text, encoding='utf-8')
                    total_generated += 1
                    # One dot per successfully generated variant, as a progress bar.
                    print(".", end="", flush=True)

            # Log only after all variants for this file were attempted, so a
            # crash mid-file causes a retry rather than a silent gap.
            save_to_log(str(file_path))
            print(" Gotowe")

    print(f"\n✅ Zakończono! Wygenerowano {total_generated} nowych plików.")
|
|
if __name__ == "__main__":
    # Allow a clean manual stop (Ctrl+C) without dumping a traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("\n🛑 Zatrzymano ręcznie.")