import os
import torch
import pytesseract
from pathlib import Path
from PIL import Image
from pdf2image import convert_from_path
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/bin/tesseract'
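
# NOTE: the hardcoded path above assumes a Homebrew install on Apple Silicon
# macOS. A more portable sketch (an assumption, not part of the original
# script) would resolve the binary from PATH and fall back to the Homebrew
# location only if the lookup fails:
#
#   import shutil
#   pytesseract.pytesseract.tesseract_cmd = (
#       shutil.which("tesseract") or r'/opt/homebrew/bin/tesseract'
#   )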
SUMMARIZER_DIR = Path(__file__).resolve().parent
BASE_DIR = SUMMARIZER_DIR.parent
MODEL_PATH = SUMMARIZER_DIR / "models" / "flan_t5_custom"
VERIFY_DIR = SUMMARIZER_DIR / "scans_to_verify_summary"
device = "mps" if torch.backends.mps.is_available() else "cpu"
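# torch.backends.mps.is_available() detects Apple's Metal backend; on
# non-macOS machines it simply returns False, so the CPU fallback keeps the
# script portable.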


def perform_ocr(file_path):
    """Convert an image or PDF file into plain text via Tesseract OCR."""
    text = ""
    try:
        if file_path.suffix.lower() == ".pdf":
            # Rasterize each PDF page, then OCR the pages one by one.
            pages = convert_from_path(file_path)
            for page in pages:
                text += pytesseract.image_to_string(page, lang='pol+eng')
        else:
            text = pytesseract.image_to_string(Image.open(file_path), lang='pol+eng')
    except Exception as e:
        print(f" [!] OCR error for {file_path.name}: {e}")
    return text
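
# Note: lang='pol+eng' requires both the Polish and English Tesseract language
# packs to be installed; a missing pack surfaces as the OCR error printed above.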


def load_model():
    print(f"🚀 Loading model from: {MODEL_PATH}...")
    if not MODEL_PATH.exists():
        raise FileNotFoundError(f"❌ Model not found at {MODEL_PATH}.")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(device)

    print("\n" + "=" * 40)
    print("🔍 TOKENIZER VERIFICATION (for comparison with the Flutter side)")

    for word in ["Janina", "Joanna"]:
        encoded = tokenizer.encode(word, add_special_tokens=False)
        print(f" IDs for word '{word}': {encoded}")

    test_ids = [0, 2664, 15, 1]
    decoded = tokenizer.decode(test_ids)
    print(f" Decoding test {test_ids}: '{decoded}'")
    print("=" * 40 + "\n")

    return tokenizer, model


def generate_text(prompt, tokenizer, model):
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
    input_len = inputs['input_ids'].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        num_beams=4,
        early_stopping=True
    )

    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result, input_len
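
# Beam search (num_beams=4) trades speed for output quality; early_stopping=True
# ends generation once every beam has emitted EOS. Because the tokenizer
# truncates at max_length=512, only the first ~512 tokens of the OCR text can
# influence the generated headline or summary.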


def main():
    tokenizer, model = load_model()

    if not VERIFY_DIR.exists():
        os.makedirs(VERIFY_DIR)
        print(f"📁 Folder {VERIFY_DIR} did not exist and has been created. Put document photos there and run again.")
        return

    extensions = [".jpg", ".jpeg", ".png", ".pdf"]
    files = [f for f in VERIFY_DIR.glob("*") if f.suffix.lower() in extensions]

    if not files:
        print(f"ℹ️ No images or PDF files in {VERIFY_DIR}.")
        return

    print(f"🔍 Found {len(files)} documents to verify.\n")

    for file_path in files:
        print(f"📄 PROCESSING: {file_path.name}")
        print("⏳ Running OCR...")

        ocr_text = perform_ocr(file_path)

        if not ocr_text.strip():
            print(f"⚠️ Could not read any text from {file_path.name}. Skipping.")
            continue

        # Build the preview outside the f-string: a backslash inside an
        # f-string expression is a syntax error on Python < 3.12.
        preview = ocr_text[:100].replace('\n', ' ')
        print(f"📊 OCR text length: {len(ocr_text)} characters")
        print(f"📝 First 100 OCR characters: {preview}...")
        print("-" * 30)

        title, t_len = generate_text(f"headline: {ocr_text}", tokenizer, model)
        print(f"📌 TITLE (input tokens: {t_len}):\n{title}\n")

        summary, s_len = generate_text(f"summarize: {ocr_text}", tokenizer, model)
        print(f"📝 SUMMARY (input tokens: {s_len}):\n{summary}")
        print("=" * 60 + "\n")


if __name__ == "__main__":
    main()
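
# Typical run (the script filename below is illustrative):
#   $ python verify_summaries.py
# The script OCRs every image/PDF found in scans_to_verify_summary/ and prints
# a generated headline and summary for each document.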