| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| import sqlite3 |
| import argparse |
| import logging |
| from pathlib import Path |
| import nltk |
| from transformers import pipeline |
| from collections import defaultdict |
| import matplotlib.pyplot as plt |
|
|
| |
# Fetch the punkt sentence-tokenizer models at import time (no-op when already cached).
nltk.download('punkt')
# NOTE(review): sent_tokenize is imported here but the analysis code loads punkt
# tokenizers directly via nltk.data.load — confirm whether this import is still needed.
from nltk import sent_tokenize
|
|
| |
def configure_logging():
    """Initialize root logging at INFO level and return this module's logger."""
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    return logging.getLogger(__name__)


# Module-wide logger, configured once at import time.
logger = configure_logging()
|
|
| |
# German aspect group -> candidate labels for zero-shot classification.
# A sentence matching any label in the list is attributed to the group key.
ASPECT_LABEL_MAP = {
    "Handlung": ["Handlung", "Plot", "Story", "Aufbau"],
    "Charaktere": ["Charaktere", "Figuren", "Protagonisten", "Nebenfiguren", "Beziehungen"],
    "Stil": ["Stil", "Sprachstil", "Sprache", "Erzählweise"],
    "Emotionale Wirkung": ["Lesevergnügen", "Berührend", "Bewegend", "Begeisternd", "Spannend"],
    "Tiefgang": ["Tiefgang", "Nachdenklich", "Philosophisch", "kritisch"],
    "Thema & Kontext": ["Thema", "Motiv", "Zeitgeschehen", "Historischer Kontext", "Gesellschaft"],
    "Originalität": ["Originalität", "Kreativität", "Innovativ", "Idee", 'Humor'],
    "Recherche & Authentizität": ["Recherche", "Authentizität", "Realismus", "Fakten"]
}


# English counterpart of ASPECT_LABEL_MAP, used for reviews tagged 'en'.
ASPECT_LABEL_MAP_EN = {
    "Plot": ["Plot", "Story", "Narrative", "Structure"],
    "Characters": ["Characters", "Protagonists", "Antagonists", "Relationships"],
    "Style": ["Style", "Language", "Tone", "Narration"],
    "Emotional Impact": ["Touching", "Funny", "Exciting", "Moving", "Engaging"],
    "Depth": ["Philosophical", "Thought-provoking", "Insightful", "Critical"],
    "Theme & Context": ["Theme", "Motif", "Historical Context", "Social Issues"],
    "Originality": ["Originality", "Creativity", "Innovation", "Idea"],
    "Research & Authenticity": ["Research", "Authenticity", "Realism", "Facts"]
}


# Flat list of all German candidate labels handed to the zero-shot classifier.
ALL_LABELS = [label for labels in ASPECT_LABEL_MAP.values() for label in labels]
|
|
| |
|
|
def load_reviews(db_path: Path, isbn: str) -> list:
    """Load cleaned German and English review texts for one book from SQLite.

    Args:
        db_path: Path to the SQLite database file.
        isbn: ISBN of the book whose reviews are fetched.

    Returns:
        List of ``(review_id, text, language)`` tuples with language 'de' or
        'en'. A single row can contribute up to two tuples, one per language
        column; NULL or non-string columns are skipped.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT id, cleaned_text, cleaned_text_en FROM reviews_und_notizen WHERE buch_isbn = ?",
            (isbn,)
        )
        rows = cursor.fetchall()
    finally:
        # Close the connection even if the query raises (the original leaked it on error).
        conn.close()

    texts_to_analyze = []
    for review_id, text_de, text_en in rows:
        # NULL columns arrive as None; also guard against non-string blob values.
        if text_de and isinstance(text_de, str):
            texts_to_analyze.append((review_id, text_de, 'de'))
        if text_en and isinstance(text_en, str):
            texts_to_analyze.append((review_id, text_en, 'en'))
    return texts_to_analyze
|
|
| |
|
|
def analyze_quickwin(db_path: Path, isbn: str, device: int = -1, languages: list[str] | None = None) -> dict:
    """Detect review aspects via zero-shot classification and score their sentiment.

    Each sentence of every cleaned review is classified against the aspect
    candidate labels; sentences matched with confidence > 0.8 are scored by a
    language-specific sentiment model and collected per aspect group.

    Args:
        db_path: Path to the SQLite database containing the reviews.
        isbn: ISBN of the book to analyze.
        device: Device index for the transformers pipelines (-1 = CPU).
        languages: Languages to include ('de'/'en'); defaults to both.
            The None default replaces the original shared mutable list default.

    Returns:
        Mapping of aspect name -> list of signed sentiment scores
        (positive values = positive sentiment); empty dict when no reviews
        exist for the requested languages.
    """
    if languages is None:
        # Fresh default per call — avoids the mutable-default-argument pitfall.
        languages = ["de", "en"]

    reviews = [r for r in load_reviews(db_path, isbn) if r[2] in languages]
    if not reviews:
        logger.warning(f"Keine gesäuberten Reviews für ISBN {isbn} in den gewählten Sprachen gefunden.")
        return {}

    # Instantiate the (expensive) models once, up front.
    zsl = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device, multi_label=True)
    sent_de = pipeline("sentiment-analysis", model="oliverguhr/german-sentiment-bert", device=device)
    sent_en = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device)

    # Per-language configuration, hoisted out of the review loop:
    # (aspect map, flat candidate labels, sentiment pipeline, ZSL hypothesis template).
    english_labels = [label for labels in ASPECT_LABEL_MAP_EN.values() for label in labels]
    lang_config = {
        'de': (ASPECT_LABEL_MAP, ALL_LABELS, sent_de, "Dieser Satz handelt von {}."),
        'en': (ASPECT_LABEL_MAP_EN, english_labels, sent_en, "This sentence is about {}."),
    }
    # Punkt tokenizers are cached so each is loaded at most once (the original
    # re-loaded the pickle for every single review).
    punkt_names = {'de': 'german', 'en': 'english'}
    tokenizers: dict[str, object] = {}

    aspect_results = defaultdict(list)
    total_aspects = 0

    for review_id, text, lang in reviews:
        if not text:
            continue
        config = lang_config.get(lang)
        if config is None:
            continue  # unsupported language tag
        aspect_map, all_labels, sent_pipeline, hypothesis_template = config

        logger.info(f"Review ID {review_id} ({lang}) wird verarbeitet.")

        if lang not in tokenizers:
            tokenizers[lang] = nltk.data.load(f"tokenizers/punkt/{punkt_names.get(lang, 'english')}.pickle")
        sentences = tokenizers[lang].tokenize(text)

        for sent in sentences:
            # Skip blanks and very short fragments that carry no aspect signal.
            if not sent.strip() or len(sent) < 15:
                continue

            result = zsl(sent, candidate_labels=all_labels, hypothesis_template=hypothesis_template)

            # Labels come back sorted by descending score; take the first one
            # above the confidence threshold and map it back to its aspect group.
            main_label = ""
            best_score = 0.0
            for label, score in zip(result["labels"], result["scores"]):
                if score > 0.8:
                    main_label = next((k for k, v in aspect_map.items() if label in v), label)
                    best_score = score
                    break

            if not main_label:
                continue

            # Signed sentiment: positive score for POSITIVE-ish labels, negated otherwise.
            ml_sentiment = sent_pipeline(sent)[0]
            ml_score = ml_sentiment['score'] if ml_sentiment['label'].upper().startswith('POS') else -ml_sentiment['score']
            final_score = ml_score
            final_label = 'POS' if final_score > 0.1 else 'NEG' if final_score < -0.1 else 'NEU'

            # Deliberate stdout output for per-sentence inspection during runs.
            print(
                f"Review {review_id} ({lang}) | Satz: {sent}\n"
                f" Aspekt: {main_label} (via '{result['labels'][0]}', {best_score:.2f}) | "
                f"ML: {ml_sentiment['label']}({ml_sentiment['score']:.2f}) -> Final: {final_label}({final_score:.2f})"
            )

            aspect_results[main_label].append(final_score)
            total_aspects += 1

    logger.info(f"Total aspects found: {total_aspects}")
    return aspect_results
|
|
def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path, filename: str = "sentiment_aspekte.png"):
    """Render a horizontal bar chart of average sentiment per aspect and save it as PNG.

    Args:
        aspect_results: Mapping of aspect name -> list of sentiment scores.
            Aspects with an empty score list are skipped (the original divided
            by zero for them).
        output_dir: Directory the PNG is written to; created if missing.
        filename: File name of the chart inside ``output_dir``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    # Drop aspects without scores to avoid ZeroDivisionError in the average.
    filtered = {aspect: scores for aspect, scores in aspect_results.items() if scores}
    aspects = list(filtered.keys())
    avg_scores = [sum(scores) / len(scores) for scores in filtered.values()]
    # Green = clearly positive, red = clearly negative, gray = neutral band.
    colors = ['green' if score > 0.1 else 'red' if score < -0.1 else 'gray' for score in avg_scores]
    # Uses the module-level matplotlib import; the original re-imported plt locally.
    plt.figure(figsize=(10, 6))
    bars = plt.barh(aspects, avg_scores, color=colors)
    plt.axvline(x=0, color='black', linewidth=0.8)
    plt.xlabel("Durchschnittlicher Sentiment-Score")
    plt.title("Sentiment-Analyse pro Aspekt")
    # Annotate each bar with its numeric average, just right of the bar end.
    for bar, score in zip(bars, avg_scores):
        plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2,
                 f"{score:.2f}", va='center')
    plt.tight_layout()
    plt.gca().invert_yaxis()
    output_path = output_dir / filename
    plt.savefig(output_path, dpi=300)
    plt.close()
    logger.info(f"Diagramm gespeichert unter: {output_path}")
|
|
| |
|
|
def main():
    """CLI entry point: parse arguments, run the analysis, visualize the results."""
    arg_parser = argparse.ArgumentParser(description="Quick-Win ABSA ohne SentiWS")
    arg_parser.add_argument("--db-path", required=True, help="Pfad zur SQLite-Datenbank")
    arg_parser.add_argument("--isbn", required=True, help="ISBN des Buchs")
    arg_parser.add_argument("--gpu", action="store_true", help="GPU verwenden (device=0)")
    arg_parser.add_argument("--languages", nargs="+", choices=["de", "en"], default=["de", "en"],
                            help="Sprachen der Reviews, z. B. --languages de oder --languages de en")
    cli_args = arg_parser.parse_args()

    results = analyze_quickwin(
        Path(cli_args.db_path),
        cli_args.isbn,
        device=0 if cli_args.gpu else -1,
        languages=cli_args.languages,
    )

    # Only plot when the analysis actually produced aspect data.
    if not results:
        logger.info("Keine Aspekt-Daten zur Visualisierung verfügbar.")
        return
    visualize_aspects(results, Path("output"))
|
|