paperstack_document_data_retrieval / process_syntethic_content.py
pemix09's picture
Add files using upload-large-folder tool
8fd4eb2 verified
import os
import json
from pathlib import Path
from langchain_ollama import OllamaLLM
# --- CONFIGURATION ---
INPUT_DIR = "synthetic_content"          # root folder with source .txt files (scanned recursively)
OUTPUT_ROOT = "synthetic_dataset"        # root folder for all generated outputs
HISTORY_FILE = "processed_synthetic_scans_contents.txt"  # resume log: one relative path per line
MODEL_NAME = "llama3"                    # Ollama model used for metadata + translations

# Target languages: ISO 639-1 code -> English language name (used in prompts)
TARGET_LANGUAGES = {
    "pl": "Polish",
    "en": "English",
    "de": "German",
    "fr": "French",
    "es": "Spanish",
    "it": "Italian",
    "uk": "Ukrainian"
}

# LLM initialized with low temperature for reproducibility
llm = OllamaLLM(model=MODEL_NAME, temperature=0)
# --- HISTORY HANDLING (RESUME SUPPORT) ---
def load_history():
    """Return the set of relative paths already processed, read from HISTORY_FILE.

    Missing history file means a fresh run: an empty set is returned.
    Blank lines are ignored.
    """
    done = set()
    if os.path.exists(HISTORY_FILE):
        with open(HISTORY_FILE, 'r', encoding='utf-8') as history:
            for raw_line in history:
                entry = raw_line.strip()
                if entry:
                    done.add(entry)
    return done
def mark_as_done(rel_path):
    """Append one processed file's relative path to the resume log.

    Called after each successfully processed file so an interrupted run
    can pick up where it left off.
    """
    with open(HISTORY_FILE, mode='a', encoding='utf-8') as log:
        log.write(rel_path + "\n")
# --- LLM PROMPTS ---
def ask_llm_json(prompt):
    """Invoke the LLM in JSON mode and parse its reply defensively.

    Returns the parsed object, or None on any LLM or parsing failure
    (the error is printed so the pipeline can continue with other files).
    """
    try:
        # format="json" is the key Ollama feature that forces valid JSON output
        raw_reply = llm.invoke(prompt, format="json")
        parsed = json.loads(raw_reply)
    except json.JSONDecodeError as e:
        print(f"\n ⚠️ Błąd składni JSON od AI: {e}")
        return None
    except Exception as e:
        print(f"\n ⚠️ Błąd komunikacji z LLM: {e}")
        return None
    return parsed
def ask_llm_text(prompt):
    """Invoke the LLM and return its reply as plain text.

    Surrounding whitespace and double/single quotes are stripped, since
    models often wrap short answers (titles, translations) in quotes.
    On any failure the error is printed and the sentinel string
    "Translation Error" is returned so callers keep working.
    """
    try:
        response = llm.invoke(prompt)
        return response.strip().strip('"').strip("'")
    except Exception as e:
        # Fix: the exception was previously swallowed silently, so a failed
        # translation was saved to disk with no diagnostic trace. Log it,
        # consistent with ask_llm_json's error reporting.
        print(f"\n ⚠️ Błąd komunikacji z LLM: {e}")
        return "Translation Error"
def get_metadata(text, hinted_type):
    """Ask the LLM for document metadata as JSON.

    Parameters:
        text: raw document text; only the first 3500 characters are sent
              to keep the prompt within a safe context size.
        hinted_type: the document's source folder name, passed to the model
              as a type hint.

    Returns the parsed dict from ask_llm_json (expected keys: "title_base",
    "summary_base", "category", "info") or None on LLM/JSON failure.
    NOTE(review): the prompt requests these keys but the model is not
    guaranteed to supply all of them — callers use .get() with defaults.
    """
    # Prompt with explicit instructions for the JSON format
    prompt = f"""
Analyze this document text.
Folder hint: {hinted_type}
Return ONLY a JSON object with these keys:
- "title_base": Factual title in ENGLISH (format: "[Type] - [Entity] - [Date]")
- "summary_base": Factual summary in ENGLISH (exactly 5 sentences)
- "category": One of: financial, legal, personal, health, property, other
- "info": Key details (e.g. document ID or service name)
Ensure all quotes inside the text are properly escaped.
TEXT:
{text[:3500]}
"""
    return ask_llm_json(prompt)
def translate_section(text, target_lang, content_type="text"):
    """Translate a piece of text into target_lang via the LLM.

    Parameters:
        text: the English source text (title or summary).
        target_lang: English name of the target language (e.g. "German").
        content_type: label inserted into the prompt ("title", "summary",
              or the default "text") to steer the translation register.

    Returns the translated string, or "Translation Error" on failure
    (see ask_llm_text).
    """
    prompt = f"""
Translate the following {content_type} into {target_lang}.
Output ONLY the translation. No conversational text or markdown.
TEXT TO TRANSLATE:
{text}
"""
    return ask_llm_text(prompt)
def save_output(root, kind, lang, subdir, filename, content):
    """Write content (stringified) under root/kind[/lang]/subdir/filename.

    The language segment is inserted only when lang is truthy; any missing
    directories are created on the way.
    """
    segments = [root, kind]
    if lang:
        segments.append(lang)
    target_dir = Path(*segments) / subdir
    target_dir.mkdir(parents=True, exist_ok=True)
    (target_dir / filename).write_text(str(content), encoding="utf-8")
# --- MAIN PER-FILE LOGIC ---
def process_file(file_path, input_root):
    """Process one source .txt file end to end.

    Steps: read the raw text, ask the LLM for metadata, save the base
    artifacts (content / category / type / info), then translate the title
    and summary into every TARGET_LANGUAGES entry and save each one.
    On read or metadata failure the function returns early WITHOUT marking
    the file as done, so it will be retried on the next run.

    Parameters:
        file_path: Path to the source .txt file.
        input_root: Path of INPUT_DIR, used to compute the relative path.
    """
    rel_path = file_path.relative_to(input_root)
    base_filename = rel_path.name
    sub_dir = rel_path.parent
    # The immediate parent folder name doubles as the document-type hint.
    doc_type = sub_dir.name
    try:
        raw_text = file_path.read_text(encoding='utf-8')
    except Exception as e:
        print(f" ❌ Błąd odczytu pliku: {e}")
        return
    # 2. Generate metadata (JSON)
    meta = get_metadata(raw_text, doc_type)
    if not meta or not isinstance(meta, dict):
        print(" ❌ Błąd AI: Nie udało się wygenerować poprawnego JSONa.")
        return
    # 3. Save base (language-independent) data
    save_output(OUTPUT_ROOT, "content", None, sub_dir, base_filename, raw_text)
    save_output(OUTPUT_ROOT, "category", None, sub_dir, base_filename, meta.get("category", "other"))
    save_output(OUTPUT_ROOT, "type", None, sub_dir, base_filename, doc_type)
    save_output(OUTPUT_ROOT, "info", None, sub_dir, base_filename, meta.get("info", "none"))
    base_title = meta.get("title_base", "Document")
    base_summary = meta.get("summary_base", "No summary available.")
    # 4. Translations
    print(f" 🌍 Tłumaczenie na {len(TARGET_LANGUAGES)} języków...", end="", flush=True)
    for code, lang_name in TARGET_LANGUAGES.items():
        # Titles — English is the base language, so no translation needed.
        if code == "en":
            title = base_title
        else:
            title = translate_section(base_title, lang_name, "title")
        save_output(OUTPUT_ROOT, "titles", code, sub_dir, base_filename, title)
        # Summaries — same base-language shortcut for English.
        if code == "en":
            summary = base_summary
        else:
            summary = translate_section(base_summary, lang_name, "summary")
        save_output(OUTPUT_ROOT, "summary", code, sub_dir, base_filename, summary)
        print(".", end="", flush=True)  # progress dot per language
    print(" OK")
    # Only mark done after all outputs were written, so partial failures retry.
    mark_as_done(str(rel_path))
def main():
    """Entry point: walk INPUT_DIR recursively and process every .txt file.

    Uses the history file to skip already-processed files (resume support).
    Ctrl-C stops the run cleanly; per-file errors are reported and the
    loop continues with the next file.
    """
    input_path = Path(INPUT_DIR)
    if not input_path.exists():
        print(f"❌ Brak folderu wejściowego: {INPUT_DIR}")
        return
    processed = load_history()
    print(f"📂 Historia: {len(processed)} plików już przetworzonych.")
    # NOTE(review): this count includes already-processed files; the actual
    # amount of new work may be smaller.
    files = list(input_path.rglob("*.txt"))
    print(f"🚀 Start: {len(files)} plików do analizy.")
    for f in files:
        rel_path = str(f.relative_to(input_path))
        if rel_path in processed:
            continue  # already done in a previous run
        print(f"📄 Przetwarzam: {rel_path}")
        try:
            process_file(f, input_path)
        except KeyboardInterrupt:
            # Progress is persisted per file, so interrupting is safe.
            print("\n🛑 Przerwano ręcznie. Postęp zapisany.")
            break
        except Exception as e:
            # Keep going: one bad file must not kill the whole batch.
            print(f"\n❌ Błąd krytyczny przy pliku {rel_path}: {e}")

if __name__ == "__main__":
    main()