"""Flask backend for the "์‹ ๋ฌธ๊ณผ๋ฐฉ์†ก" article performance prediction web app.
This server exposes prediction and metadata endpoints that rely on the
pre-trained artifacts produced during the offline training pipeline.
"""
from __future__ import annotations
import json
import logging
import os
import re
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union, cast
from flask import Flask, jsonify, request, send_from_directory # type: ignore[import]
from konlpy.tag import Okt
import joblib # type: ignore[import]
import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics.pairwise import cosine_similarity # type: ignore[import]
from dotenv import load_dotenv
import google.generativeai as genai
# pandas is needed for category input handling; import it defensively so a missing install fails fast with a clear message.
try:
import pandas as pd
except ImportError: # pragma: no cover - pandas should be available, but we guard just in case.
pd = None # type: ignore
if pd is None:
raise RuntimeError(
"pandas is required for this application. Please install pandas in the runtime environment."
)
load_dotenv()
API_KEY = os.getenv("GEMINI_API_KEY")
if not API_KEY:
raise RuntimeError("GEMINI_API_KEY is not set. Please define it in your .env file.")
genai.configure(api_key=API_KEY) # type: ignore[attr-defined]
SEO_MODEL_NAME = "gemma-3-27b-it"
SEO_GENERATIVE_MODEL = genai.GenerativeModel(SEO_MODEL_NAME) # type: ignore[attr-defined]
BASE_DIR = Path(__file__).resolve().parent
ARTIFACT_DIR = BASE_DIR / "artifacts"
DATA_DIR = BASE_DIR / "data_csv"
CONTENTS_CSV = DATA_DIR / "contents.csv"
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# Initialize the Okt tokenizer once at module scope so it can be reused across requests.
OKT = Okt()
def okt_tokenizer(text: str) -> list[str]:
"""
Tokenize text using Okt to extract nouns and verbs.
This function must be defined in the same way as in the training script
for the TfidfVectorizer to be loaded correctly.
"""
if not isinstance(text, str) or not text.strip():
return []
    # `stem=True` restores each word to its base form (e.g., '๋‹ฌ๋ ธ๋‹ค' -> '๋‹ฌ๋ฆฌ๋‹ค').
    # This must stay identical to the tokenizer used in the training script.
return [
word
for word, tag in OKT.pos(text, stem=True)
if tag in ["Noun", "Verb"]
]
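# Rough illustration of the tokenizer's behaviour (exact output can vary with the
# installed konlpy/Okt version): okt_tokenizer("๊ธฐ์‚ฌ๋ฅผ ์ฝ์—ˆ๋‹ค") would yield something
# like ['๊ธฐ์‚ฌ', '์ฝ๋‹ค'] -- nouns kept as-is, verbs stemmed, particles dropped.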
# Ensure unpickling can find the tokenizer: the vectorizer was pickled in the training
# script where `okt_tokenizer` lived in `__main__`, so expose it there as well.
import __main__ as _main # noqa: E402 (import placed here intentionally)
setattr(_main, "okt_tokenizer", okt_tokenizer)
def _resolve_artifact_path(filename: str) -> Path:
"""Return the most likely path for a persisted artifact.
The training pipeline may save artifacts either in the project root or in an
`artifacts/` sub-directory. We attempt both locations for convenience and to
provide a clear error if the file cannot be found.
"""
direct_path = BASE_DIR / filename
if direct_path.exists():
return direct_path
artifacts_path = ARTIFACT_DIR / filename
if artifacts_path.exists():
return artifacts_path
search_locations = [str(direct_path), str(artifacts_path)]
raise FileNotFoundError(
f"Artifact '{filename}' could not be located. Looked in: {search_locations}"
)
def _load_artifact(filename: str) -> Any:
"""Load a pickled artifact using joblib with helpful error messaging."""
path = _resolve_artifact_path(filename)
return joblib.load(path)
app = Flask(__name__, static_folder=".", template_folder=".")
# --- Artifact Loading -------------------------------------------------------------------------
try:
tfidf_vectorizer = _load_artifact("tfidf_vectorizer.pkl")
onehot_encoder = _load_artifact("onehot_encoder.pkl")
label_encoder = _load_artifact("label_encoder.pkl")
view_prediction_model = _load_artifact("view_prediction_model.pkl")
age_prediction_model = _load_artifact("age_prediction_model.pkl")
text_features_matrix = _load_artifact("text_features_matrix.pkl")
article_mapping = _load_artifact("article_mapping.pkl")
except FileNotFoundError as exc: # pragma: no cover - occurs only if artifacts missing.
# Fail fast during startup so the issue can be resolved immediately.
raise RuntimeError(
"Required model artifacts are missing. Ensure Phase 1-2 outputs are saved before "
"starting the server."
) from exc
if not isinstance(text_features_matrix, csr_matrix):
# Convert any compatible sparse matrix to CSR format for efficient row slicing.
text_features_matrix = csr_matrix(text_features_matrix)
try:
contents_dataframe = pd.read_csv(CONTENTS_CSV)
except FileNotFoundError as exc:
raise RuntimeError(
f"Required contents dataset not found at {CONTENTS_CSV}."
) from exc
article_content_lookup: Dict[Any, str] = {}
if "article_id" in contents_dataframe.columns and "content" in contents_dataframe.columns:
article_content_lookup = {
str(row.article_id): (row.content if isinstance(row.content, str) else "")
for row in contents_dataframe.itertuples()
}
else: # pragma: no cover - dataset schema mismatch
raise RuntimeError(
"contents.csv must contain 'article_id' and 'content' columns."
)
_encoded_categories: List[str]
try:
categories_arr = getattr(onehot_encoder, "categories_", None)
if categories_arr:
_encoded_categories = sorted(str(cat) for cat in categories_arr[0])
else:
_encoded_categories = []
except AttributeError: # pragma: no cover
_encoded_categories = []
def _ensure_dataframe(category: str) -> Any:
"""Create a minimal DataFrame for category encoding.
Falls back to a simple dictionary-based DataFrame when pandas is available,
otherwise raises a clear ImportError with remediation guidance.
"""
if pd is None:
raise ImportError(
"pandas is required to prepare categorical inputs. Please install pandas or "
"ensure the training environment's dependencies are mirrored here."
)
return pd.DataFrame({"category": [category]})
def _lookup_article_metadata(index: int) -> Dict[str, Any]:
"""Return metadata for a given row index in the text feature matrix.
During Phase 2 the pipeline should persist either a pandas DataFrame or a
dictionary-like mapping that provides `article_id` and `title` fields. This
helper normalises the structure so the API can rely on consistent keys.
"""
if isinstance(article_mapping, dict):
entry = article_mapping.get(index)
if entry is None:
entry = article_mapping.get(str(index))
if isinstance(entry, dict):
return {
"id": entry.get("article_id") or entry.get("id"),
"title": entry.get("title") or entry.get("article_title"),
}
if isinstance(entry, (list, tuple)) and entry:
entry_seq = cast(Sequence[Any], entry)
article_id = entry_seq[0]
article_title = entry_seq[1] if len(entry_seq) > 1 else entry_seq[0]
return {"id": article_id, "title": article_title}
mapping_obj = cast(Any, article_mapping)
if hasattr(mapping_obj, "iloc"):
# Supports pandas DataFrame or Series-like objects with iloc.
try:
row = mapping_obj.iloc[index]
except Exception: # pragma: no cover - defensive guard.
row = None
if row is not None:
if hasattr(row, "to_dict"):
row_dict = row.to_dict()
return {
"id": row_dict.get("article_id") or row_dict.get("id"),
"title": row_dict.get("title") or row_dict.get("article_title"),
}
# Fallback: surface the index so downstream consumers can still show something.
return {"id": int(index), "title": f"Article #{index}"}
def _find_similar_articles(query_vector: csr_matrix, top_k: int = 5) -> List[Dict[str, Any]]:
"""Return the top-k most similar articles for the provided query vector."""
similarities = cosine_similarity(query_vector, text_features_matrix).ravel()
# Sort indices by descending similarity.
ranked_indices = np.argsort(similarities)[::-1]
similar_articles: List[Dict[str, Any]] = []
for idx in ranked_indices:
score = float(similarities[idx])
        # Skip a leading near-identical match (typically the query article itself when scoring an existing article).
if score >= 0.9999 and not similar_articles:
continue
metadata = _lookup_article_metadata(int(idx))
metadata.update({"similarity": round(score, 4)})
similar_articles.append(metadata)
if len(similar_articles) >= top_k:
break
return similar_articles
def _extract_first_sentence(text: str) -> str:
"""Return the first sentence-like fragment from the provided text."""
if not text:
return ""
cleaned = re.sub(r"\s+", " ", text).strip()
if not cleaned:
return ""
    fragments = re.split(r"(?<=[.!?๏ผŸ!ใ€‚])\s+", cleaned)
    for fragment in fragments:
        fragment = fragment.strip()
        if fragment:
            return fragment
return cleaned[:80]
def generate_seo_suggestions(content: str) -> Dict[str, str]:
"""Generate SEO title and description using Google Gemini."""
safe_content = content or ""
safe_content = re.sub(r"\s+", " ", safe_content).strip()
prompt = (
"You are a lead digital editor for a korean prestigious online media company that bridges in-depth analysis with current trends. "
"Your mission is to craft an SEO title and description that are both intelligent and highly shareable. The goal is to highlight the article's most timely, newsworthy, and debate-sparking elements to maximize public interest and social engagement.\n\n"
"Guidelines:\n"
"1. **'title' (under 60 characters):** **Start with a topic tag in brackets (e.g., `[์ฃผ์ œ]`)** that summarizes the core subject. Following the tag, frame the core topic as a compelling thesis or a provocative question. Connect it to a current conversation or a surprising trend to make it feel urgent and relevant *today*. It should make people think, 'This is an interesting take.'\n"
"2. **'description' (under 150 characters, in Korean):** Go beyond summary. Contextualize the article's importance. Explain *why* this topic matters *now* and what new perspective the article offers on a familiar issue. It should persuade readers that this article will give them a crucial viewpoint for today's conversations.\n"
"3. **Format:** Respond strictly with a valid JSON object with 'title' and 'description' keys. Avoid generic phrases, clickbait, and anything that undermines the intellectual integrity of the brand.\n\n"
f"Article Content:\n{safe_content}\n\n"
"Return exactly: {\"title\": \"[<์ฃผ์ œ>] <์ƒ์„ฑ๋œ ์ œ๋ชฉ>\", \"description\": \"<์ƒ์„ฑ๋œ ์„ค๋ช…>\"}"
)
try:
response = SEO_GENERATIVE_MODEL.generate_content(prompt)
raw_text = getattr(response, "text", "") or ""
if not raw_text and getattr(response, "candidates", None):
collected_parts: List[str] = []
for candidate in response.candidates: # type: ignore[attr-defined]
candidate_content: Any = getattr(candidate, "content", None)
parts = getattr(candidate_content, "parts", None) if candidate_content else None
if parts:
for part in parts:
text_part = getattr(part, "text", None)
if text_part:
collected_parts.append(str(text_part))
raw_text = " ".join(collected_parts)
cleaned_text = raw_text.strip()
if not cleaned_text:
raise ValueError("SEO model returned an empty response")
if cleaned_text.startswith("```"):
cleaned_text = re.sub(r"^```(?:json)?", "", cleaned_text, flags=re.IGNORECASE).strip()
cleaned_text = re.sub(r"```$", "", cleaned_text).strip()
match = re.search(r"{.*}", cleaned_text, re.DOTALL)
json_payload = match.group(0) if match else cleaned_text
seo_json = json.loads(json_payload)
suggested_title = str(seo_json.get("title", "")).strip()
suggested_description = str(seo_json.get("description", "")).strip()
if not suggested_title or not suggested_description:
raise ValueError("SEO model response missing required fields")
return {
"suggested_title": suggested_title[:60],
"suggested_description": suggested_description[:150],
}
except Exception as exc: # pragma: no cover - external API failures
logger.error("SEO model (%s) generation failed: %s", SEO_MODEL_NAME, exc)
fallback_title = _extract_first_sentence(safe_content) or safe_content[:60]
fallback_description = safe_content[:150]
return {
"suggested_title": fallback_title,
"suggested_description": fallback_description,
}
@app.route("/", methods=["GET"])
def serve_index() -> Any:
"""Serve the single-page frontend."""
template_dir = app.template_folder or "."
return send_from_directory(template_dir, "index.html")
@app.route("/healthz", methods=["GET"])
def healthcheck() -> Any:
"""Simple health check endpoint."""
return jsonify({"status": "ok"})
@app.route("/categories", methods=["GET"])
def list_categories() -> Any:
"""Expose category options inferred from the fitted OneHotEncoder."""
return jsonify({"categories": _encoded_categories})
@app.route("/predict", methods=["POST"])
def predict() -> Any:
payload = request.get_json(silent=True) or {}
required_fields = {"title", "content", "category"}
missing = [field for field in required_fields if not payload.get(field)]
if missing:
return (
jsonify({"error": f"Missing required fields: {', '.join(missing)}"}),
400,
)
title: str = str(payload.get("title", "")).strip()
content: str = str(payload.get("content", "")).strip()
category: str = str(payload.get("category", "")).strip()
combined_text = f"{title} {content}".strip()
if not combined_text:
return jsonify({"error": "Title and content cannot both be empty."}), 400
text_vector = tfidf_vectorizer.transform([combined_text])
try:
category_frame = _ensure_dataframe(category)
category_vector = onehot_encoder.transform(category_frame[["category"]])
except Exception as exc:
return jsonify({"error": f"Failed to encode category: {exc}"}), 400
feature_vector = hstack([text_vector, category_vector])
view_prediction = view_prediction_model.predict(feature_vector)[0]
predicted_views = int(round(float(view_prediction)))
age_prediction = age_prediction_model.predict(feature_vector)[0]
predicted_age_index = int(age_prediction)
similar_articles_raw = _find_similar_articles(text_vector, top_k=5)
similar_articles: List[Dict[str, Any]] = []
for article in similar_articles_raw:
article_id = article.get("id")
article_title = article.get("title")
lookup_key = str(article_id) if article_id is not None else ""
content_text = article_content_lookup.get(lookup_key, "")
summary = content_text.strip()[:100]
similar_articles.append(
{
"id": article_id,
"title": article_title,
"summary": summary,
}
)
try:
decoded_age_group = label_encoder.inverse_transform([predicted_age_index])[0]
except Exception:
decoded_age_group = str(predicted_age_index)
seo_recommendation = generate_seo_suggestions(content)
seo_simulation = {
"current_state": {
"issue": "๋ฉ”ํƒ€ ์ •๋ณด์— ํ•ต์‹ฌ ํ‚ค์›Œ๋“œ๊ฐ€ ๋ถ€์กฑํ•˜๊ณ  ์„ค๋ช…์ด ๋„ˆ๋ฌด ๊ธธ์–ด SERP์—์„œ ์ž˜๋ฆฝ๋‹ˆ๋‹ค.",
"title": title[:70] or "์ œ๋ชฉ์ด ์ž…๋ ฅ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.",
"description": (content[:150] + ("..." if len(content) > 150 else "")) if content else "๋ณธ๋ฌธ์ด ์ž…๋ ฅ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.",
},
"recommended_state": {
"title": seo_recommendation["suggested_title"],
"description": seo_recommendation["suggested_description"],
},
}
response_payload = {
"ai_prediction": {
"predicted_views": predicted_views,
"predicted_age_group": decoded_age_group,
"similar_articles": similar_articles,
},
"seo_simulation": seo_simulation,
}
return jsonify(response_payload)
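# For reference, a successful /predict response has roughly this shape (values are
# illustrative, not taken from real model output):
#   {
#     "ai_prediction": {
#       "predicted_views": 1234,
#       "predicted_age_group": "<decoded age group>",
#       "similar_articles": [{"id": "...", "title": "...", "summary": "..."}]
#     },
#     "seo_simulation": {
#       "current_state": {"issue": "...", "title": "...", "description": "..."},
#       "recommended_state": {"title": "...", "description": "..."}
#     }
#   }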
@app.route("/generate-description", methods=["POST"])
def generate_description() -> Any:
payload = request.get_json(silent=True) or {}
content_text = str(payload.get("content", ""))
if not content_text.strip():
return jsonify({"error": "Content is required to generate a description."}), 400
suggestions = generate_seo_suggestions(content_text)
return jsonify(
{
"title": suggestions.get("suggested_title", ""),
"description": suggestions.get("suggested_description", ""),
}
)
if __name__ == "__main__": # pragma: no cover - manual execution only.
app.run(host="0.0.0.0", port=5000, debug=False)