| """Flask backend for the "์ ๋ฌธ๊ณผ๋ฐฉ์ก" article performance prediction web app. | |
| This server exposes prediction and metadata endpoints that rely on the | |
| pre-trained artifacts produced during the offline training pipeline. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| import os | |
| import re | |
| from pathlib import Path | |
| from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union, cast | |
| from flask import Flask, jsonify, request, send_from_directory # type: ignore[import] | |
| from konlpy.tag import Okt | |
| import joblib # type: ignore[import] | |
| import numpy as np | |
| from scipy.sparse import csr_matrix, hstack | |
| from sklearn.metrics.pairwise import cosine_similarity # type: ignore[import] | |
| from dotenv import load_dotenv | |
| import google.generativeai as genai | |

# Optional dependency: pandas is only required for category input handling.
try:
    import pandas as pd
except ImportError:  # pragma: no cover - pandas should be available, but we guard just in case.
    pd = None  # type: ignore

if pd is None:
    raise RuntimeError(
        "pandas is required for this application. Please install pandas in the runtime environment."
    )

load_dotenv()
API_KEY = os.getenv("GEMINI_API_KEY")
if not API_KEY:
    raise RuntimeError("GEMINI_API_KEY is not set. Please define it in your .env file.")

genai.configure(api_key=API_KEY)  # type: ignore[attr-defined]
SEO_MODEL_NAME = "gemma-3-27b-it"
SEO_GENERATIVE_MODEL = genai.GenerativeModel(SEO_MODEL_NAME)  # type: ignore[attr-defined]

BASE_DIR = Path(__file__).resolve().parent
ARTIFACT_DIR = BASE_DIR / "artifacts"
DATA_DIR = BASE_DIR / "data_csv"
CONTENTS_CSV = DATA_DIR / "contents.csv"

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Initialize the Okt object globally so it is created once and shared across requests.
OKT = Okt()


def okt_tokenizer(text: str) -> list[str]:
    """
    Tokenize text using Okt to extract nouns and verbs.

    This function must be defined in the same way as in the training script
    for the TfidfVectorizer to be loaded correctly.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    # `stem=True` restores each word to its base form (e.g., '달렸다' -> '달리다').
    # Keep this identical to the training script.
    return [
        word
        for word, tag in OKT.pos(text, stem=True)
        if tag in ["Noun", "Verb"]
    ]
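
# Rough illustration of the filter above (exact Okt output can vary by konlpy version;
# the sample sentence is purely hypothetical):
#   OKT.pos("기사를 읽었다", stem=True) yields pairs such as
#   ('기사', 'Noun'), ('를', 'Josa'), ('읽다', 'Verb'),
#   so okt_tokenizer would return only ['기사', '읽다'].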

# Ensure unpickling finds the tokenizer when joblib loads artifacts created with the
# same function defined at module scope in the training script.
import __main__ as _main  # noqa: E402 (import placed here intentionally)

setattr(_main, "okt_tokenizer", okt_tokenizer)


def _resolve_artifact_path(filename: str) -> Path:
    """Return the most likely path for a persisted artifact.

    The training pipeline may save artifacts either in the project root or in an
    `artifacts/` sub-directory. We attempt both locations for convenience and to
    provide a clear error if the file cannot be found.
    """
    direct_path = BASE_DIR / filename
    if direct_path.exists():
        return direct_path
    artifacts_path = ARTIFACT_DIR / filename
    if artifacts_path.exists():
        return artifacts_path
    search_locations = [str(direct_path), str(artifacts_path)]
    raise FileNotFoundError(
        f"Artifact '{filename}' could not be located. Looked in: {search_locations}"
    )


def _load_artifact(filename: str) -> Any:
    """Load a pickled artifact using joblib with helpful error messaging."""
    path = _resolve_artifact_path(filename)
    return joblib.load(path)


app = Flask(__name__, static_folder=".", template_folder=".")

# --- Artifact Loading -------------------------------------------------------------------------
try:
    tfidf_vectorizer = _load_artifact("tfidf_vectorizer.pkl")
    onehot_encoder = _load_artifact("onehot_encoder.pkl")
    label_encoder = _load_artifact("label_encoder.pkl")
    view_prediction_model = _load_artifact("view_prediction_model.pkl")
    age_prediction_model = _load_artifact("age_prediction_model.pkl")
    text_features_matrix = _load_artifact("text_features_matrix.pkl")
    article_mapping = _load_artifact("article_mapping.pkl")
except FileNotFoundError as exc:  # pragma: no cover - occurs only if artifacts missing.
    # Fail fast during startup so the issue can be resolved immediately.
    raise RuntimeError(
        "Required model artifacts are missing. Ensure Phase 1-2 outputs are saved before "
        "starting the server."
    ) from exc

if not isinstance(text_features_matrix, csr_matrix):
    # Convert any compatible sparse matrix to CSR format for efficient row slicing.
    text_features_matrix = csr_matrix(text_features_matrix)

try:
    contents_dataframe = pd.read_csv(CONTENTS_CSV)
except FileNotFoundError as exc:
    raise RuntimeError(
        f"Required contents dataset not found at {CONTENTS_CSV}."
    ) from exc

article_content_lookup: Dict[Any, str] = {}
if "article_id" in contents_dataframe.columns and "content" in contents_dataframe.columns:
    article_content_lookup = {
        str(row.article_id): (row.content if isinstance(row.content, str) else "")
        for row in contents_dataframe.itertuples()
    }
else:  # pragma: no cover - dataset schema mismatch
    raise RuntimeError(
        "contents.csv must contain 'article_id' and 'content' columns."
    )

_encoded_categories: List[str]
try:
    categories_arr = getattr(onehot_encoder, "categories_", None)
    if categories_arr:
        _encoded_categories = sorted(str(cat) for cat in categories_arr[0])
    else:
        _encoded_categories = []
except AttributeError:  # pragma: no cover
    _encoded_categories = []
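
# Note: `categories_` holds one array of labels per encoded input column; the encoder is
# applied to a single "category" column later on, so `categories_[0]` above is the full
# list of category labels known from training.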


def _ensure_dataframe(category: str) -> Any:
    """Create a minimal single-row DataFrame for category encoding.

    Builds the DataFrame when pandas is available, otherwise raises a clear
    ImportError with remediation guidance.
    """
    if pd is None:
        raise ImportError(
            "pandas is required to prepare categorical inputs. Please install pandas or "
            "ensure the training environment's dependencies are mirrored here."
        )
    return pd.DataFrame({"category": [category]})


def _lookup_article_metadata(index: int) -> Dict[str, Any]:
    """Return metadata for a given row index in the text feature matrix.

    During Phase 2 the pipeline should persist either a pandas DataFrame or a
    dictionary-like mapping that provides `article_id` and `title` fields. This
    helper normalises the structure so the API can rely on consistent keys.
    """
    if isinstance(article_mapping, dict):
        entry = article_mapping.get(index)
        if entry is None:
            entry = article_mapping.get(str(index))
        if isinstance(entry, dict):
            return {
                "id": entry.get("article_id") or entry.get("id"),
                "title": entry.get("title") or entry.get("article_title"),
            }
        if isinstance(entry, (list, tuple)) and entry:
            entry_seq = cast(Sequence[Any], entry)
            article_id = entry_seq[0]
            article_title = entry_seq[1] if len(entry_seq) > 1 else entry_seq[0]
            return {"id": article_id, "title": article_title}
    mapping_obj = cast(Any, article_mapping)
    if hasattr(mapping_obj, "iloc"):
        # Supports pandas DataFrame or Series-like objects with iloc.
        try:
            row = mapping_obj.iloc[index]
        except Exception:  # pragma: no cover - defensive guard.
            row = None
        if row is not None:
            if hasattr(row, "to_dict"):
                row_dict = row.to_dict()
                return {
                    "id": row_dict.get("article_id") or row_dict.get("id"),
                    "title": row_dict.get("title") or row_dict.get("article_title"),
                }
    # Fallback: surface the index so downstream consumers can still show something.
    return {"id": int(index), "title": f"Article #{index}"}


def _find_similar_articles(query_vector: csr_matrix, top_k: int = 5) -> List[Dict[str, Any]]:
    """Return the top-k most similar articles for the provided query vector."""
    similarities = cosine_similarity(query_vector, text_features_matrix).ravel()
    # Sort indices by descending similarity.
    ranked_indices = np.argsort(similarities)[::-1]
    similar_articles: List[Dict[str, Any]] = []
    for idx in ranked_indices:
        score = float(similarities[idx])
        # Skip near-identical matches (useful if querying an existing article).
        if score >= 0.9999 and not similar_articles:
            continue
        metadata = _lookup_article_metadata(int(idx))
        metadata.update({"similarity": round(score, 4)})
        similar_articles.append(metadata)
        if len(similar_articles) >= top_k:
            break
    return similar_articles
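
# Shape note: `query_vector` is a single TF-IDF row (1 x V) and `text_features_matrix`
# is N x V, so cosine_similarity returns a (1, N) array that ravel() flattens into one
# score per stored article.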


def _extract_first_sentence(text: str) -> str:
    """Return the first sentence-like fragment from the provided text."""
    if not text:
        return ""
    cleaned = re.sub(r"\s+", " ", text).strip()
    if not cleaned:
        return ""
    sentence_endings = re.split(r"(?<=[.!?!?。])\s+", cleaned)
    for fragment in sentence_endings:
        fragment = fragment.strip()
        if fragment:
            return fragment
    return cleaned[:80]


def generate_seo_suggestions(content: str) -> Dict[str, str]:
    """Generate an SEO title and description using Google Gemini."""
    safe_content = content or ""
    safe_content = re.sub(r"\s+", " ", safe_content).strip()
    prompt = (
        "You are a lead digital editor for a prestigious Korean online media company that bridges in-depth analysis with current trends. "
        "Your mission is to craft an SEO title and description that are both intelligent and highly shareable. The goal is to highlight the article's most timely, newsworthy, and debate-sparking elements to maximize public interest and social engagement.\n\n"
        "Guidelines:\n"
        "1. **'title' (under 60 characters):** **Start with a topic tag in brackets (e.g., `[주제]`)** that summarizes the core subject. Following the tag, frame the core topic as a compelling thesis or a provocative question. Connect it to a current conversation or a surprising trend to make it feel urgent and relevant *today*. It should make people think, 'This is an interesting take.'\n"
        "2. **'description' (under 150 characters, in Korean):** Go beyond summary. Contextualize the article's importance. Explain *why* this topic matters *now* and what new perspective the article offers on a familiar issue. It should persuade readers that this article will give them a crucial viewpoint for today's conversations.\n"
        "3. **Format:** Respond strictly with a valid JSON object with 'title' and 'description' keys. Avoid generic phrases, clickbait, and anything that undermines the intellectual integrity of the brand.\n\n"
        f"Article Content:\n{safe_content}\n\n"
        "Return exactly: {\"title\": \"[<주제>] <생성된 제목>\", \"description\": \"<생성된 설명>\"}"
    )
    try:
        response = SEO_GENERATIVE_MODEL.generate_content(prompt)
        raw_text = getattr(response, "text", "") or ""
        if not raw_text and getattr(response, "candidates", None):
            collected_parts: List[str] = []
            for candidate in response.candidates:  # type: ignore[attr-defined]
                candidate_content: Any = getattr(candidate, "content", None)
                parts = getattr(candidate_content, "parts", None) if candidate_content else None
                if parts:
                    for part in parts:
                        text_part = getattr(part, "text", None)
                        if text_part:
                            collected_parts.append(str(text_part))
            raw_text = " ".join(collected_parts)
        cleaned_text = raw_text.strip()
        if not cleaned_text:
            raise ValueError("SEO model returned an empty response")
        if cleaned_text.startswith("```"):
            cleaned_text = re.sub(r"^```(?:json)?", "", cleaned_text, flags=re.IGNORECASE).strip()
            cleaned_text = re.sub(r"```$", "", cleaned_text).strip()
        match = re.search(r"{.*}", cleaned_text, re.DOTALL)
        json_payload = match.group(0) if match else cleaned_text
        seo_json = json.loads(json_payload)
        suggested_title = str(seo_json.get("title", "")).strip()
        suggested_description = str(seo_json.get("description", "")).strip()
        if not suggested_title or not suggested_description:
            raise ValueError("SEO model response missing required fields")
        return {
            "suggested_title": suggested_title[:60],
            "suggested_description": suggested_description[:150],
        }
    except Exception as exc:  # pragma: no cover - external API failures
        logger.error("SEO model (%s) generation failed: %s", SEO_MODEL_NAME, exc)
        fallback_title = _extract_first_sentence(safe_content) or safe_content[:60]
        fallback_description = safe_content[:150]
        return {
            "suggested_title": fallback_title,
            "suggested_description": fallback_description,
        }
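
# Usage sketch (hypothetical input; the live Gemini response, and therefore the returned
# values, will vary):
#   generate_seo_suggestions("서울시가 새로운 미디어 정책을 발표했다 ...")
#   -> {"suggested_title": "[주제] ...", "suggested_description": "..."}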


# NOTE: the route paths registered below are assumptions; align them with the URLs the
# frontend actually requests.
@app.route("/")
def serve_index() -> Any:
    """Serve the single-page frontend."""
    template_dir = app.template_folder or "."
    return send_from_directory(template_dir, "index.html")


@app.route("/health")
def healthcheck() -> Any:
    """Simple health check endpoint."""
    return jsonify({"status": "ok"})


@app.route("/categories")
def list_categories() -> Any:
    """Expose category options inferred from the fitted OneHotEncoder."""
    return jsonify({"categories": _encoded_categories})


@app.route("/predict", methods=["POST"])
def predict() -> Any:
    """Predict performance for a draft article and build SEO suggestions."""
    payload = request.get_json(silent=True) or {}
    required_fields = {"title", "content", "category"}
    missing = [field for field in required_fields if not payload.get(field)]
    if missing:
        return (
            jsonify({"error": f"Missing required fields: {', '.join(missing)}"}),
            400,
        )

    title: str = str(payload.get("title", "")).strip()
    content: str = str(payload.get("content", "")).strip()
    category: str = str(payload.get("category", "")).strip()
    combined_text = f"{title} {content}".strip()
    if not combined_text:
        return jsonify({"error": "Title and content cannot both be empty."}), 400

    text_vector = tfidf_vectorizer.transform([combined_text])
    try:
        category_frame = _ensure_dataframe(category)
        category_vector = onehot_encoder.transform(category_frame[["category"]])
    except Exception as exc:
        return jsonify({"error": f"Failed to encode category: {exc}"}), 400

    feature_vector = hstack([text_vector, category_vector])
    view_prediction = view_prediction_model.predict(feature_vector)[0]
    predicted_views = int(round(float(view_prediction)))
    age_prediction = age_prediction_model.predict(feature_vector)[0]
    predicted_age_index = int(age_prediction)

    similar_articles_raw = _find_similar_articles(text_vector, top_k=5)
    similar_articles: List[Dict[str, Any]] = []
    for article in similar_articles_raw:
        article_id = article.get("id")
        article_title = article.get("title")
        lookup_key = str(article_id) if article_id is not None else ""
        content_text = article_content_lookup.get(lookup_key, "")
        summary = content_text.strip()[:100]
        similar_articles.append(
            {
                "id": article_id,
                "title": article_title,
                "summary": summary,
            }
        )

    try:
        decoded_age_group = label_encoder.inverse_transform([predicted_age_index])[0]
    except Exception:
        decoded_age_group = str(predicted_age_index)

    seo_recommendation = generate_seo_suggestions(content)
    seo_simulation = {
        "current_state": {
            "issue": "메타 정보에 핵심 키워드가 부족하고 설명이 너무 길어 SERP에서 잘립니다.",
            "title": title[:70] or "제목이 입력되지 않았습니다.",
            "description": (content[:150] + ("..." if len(content) > 150 else "")) if content else "본문이 입력되지 않았습니다.",
        },
        "recommended_state": {
            "title": seo_recommendation["suggested_title"],
            "description": seo_recommendation["suggested_description"],
        },
    }

    response_payload = {
        "ai_prediction": {
            "predicted_views": predicted_views,
            "predicted_age_group": decoded_age_group,
            "similar_articles": similar_articles,
        },
        "seo_simulation": seo_simulation,
    }
    return jsonify(response_payload)


@app.route("/generate_description", methods=["POST"])
def generate_description() -> Any:
    """Generate an SEO title and description for the submitted content only."""
    payload = request.get_json(silent=True) or {}
    content_text = str(payload.get("content", ""))
    if not content_text.strip():
        return jsonify({"error": "Content is required to generate a description."}), 400
    suggestions = generate_seo_suggestions(content_text)
    return jsonify(
        {
            "title": suggestions.get("suggested_title", ""),
            "description": suggestions.get("suggested_description", ""),
        }
    )
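
# Example requests against the assumed routes above (paths, port, and field values are
# illustrative only and may differ in your deployment):
#
#   curl -X POST http://localhost:5000/predict \
#        -H "Content-Type: application/json" \
#        -d '{"title": "<제목>", "content": "<본문>", "category": "<카테고리>"}'
#
#   curl -X POST http://localhost:5000/generate_description \
#        -H "Content-Type: application/json" \
#        -d '{"content": "<본문>"}'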


if __name__ == "__main__":  # pragma: no cover - manual execution only.
    app.run(host="0.0.0.0", port=5000, debug=False)