| """Flask backend for the "์ ๋ฌธ๊ณผ๋ฐฉ์ก" article performance prediction web app. | |
| This server exposes prediction and metadata endpoints that rely on the | |
| pre-trained artifacts produced during the offline training pipeline. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| import os | |
| import re | |
| from pathlib import Path | |
| from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union, cast | |
| from flask import Flask, jsonify, request, send_from_directory # type: ignore[import] | |
| from konlpy.tag import Okt | |
| import joblib # type: ignore[import] | |
| import numpy as np | |
| from scipy.sparse import csr_matrix, hstack | |
| from sklearn.metrics.pairwise import cosine_similarity # type: ignore[import] | |
| from dotenv import load_dotenv | |
| import google.generativeai as genai | |

# Optional dependency: pandas is only required for category input handling.
try:
    import pandas as pd
except ImportError:  # pragma: no cover - pandas should be available, but we guard just in case.
    pd = None  # type: ignore

if pd is None:
    raise RuntimeError(
        "pandas is required for this application. Please install pandas in the runtime environment."
    )

load_dotenv()
API_KEY = os.getenv("GEMINI_API_KEY")
if not API_KEY:
    raise RuntimeError("GEMINI_API_KEY is not set. Please define it in your .env file.")

genai.configure(api_key=API_KEY)  # type: ignore[attr-defined]
SEO_MODEL_NAME = "gemma-3-27b-it"
SEO_GENERATIVE_MODEL = genai.GenerativeModel(SEO_MODEL_NAME)  # type: ignore[attr-defined]

BASE_DIR = Path(__file__).resolve().parent
ARTIFACT_DIR = BASE_DIR / "artifacts"
DATA_DIR = BASE_DIR / "data_csv"
CONTENTS_CSV = DATA_DIR / "contents.csv"

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Initialize the Okt object globally so it is created once and shared across requests.
OKT = Okt()


def okt_tokenizer(text: str) -> list[str]:
    """
    Tokenize text using Okt to extract nouns and verbs.

    This function must be defined in the same way as in the training script
    for the TfidfVectorizer to be loaded correctly.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    # `stem=True` restores each word to its base form (e.g., '달렸다' -> '달리다').
    # Keep this identical to the training script.
    return [
        word
        for word, tag in OKT.pos(text, stem=True)
        if tag in ["Noun", "Verb"]
    ]
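
# Rough illustration of the filter above (exact Okt output can vary by konlpy version;
# the sample sentence is purely hypothetical):
#   OKT.pos("기사를 읽었다", stem=True) yields pairs such as
#   ('기사', 'Noun'), ('를', 'Josa'), ('읽다', 'Verb'),
#   so okt_tokenizer would return only ['기사', '읽다'].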

# Ensure unpickling finds the tokenizer when joblib loads artifacts created with the
# same function defined at module scope in the training script.
import __main__ as _main  # noqa: E402 (import placed here intentionally)

setattr(_main, "okt_tokenizer", okt_tokenizer)


def _resolve_artifact_path(filename: str) -> Path:
    """Return the most likely path for a persisted artifact.

    The training pipeline may save artifacts either in the project root or in an
    `artifacts/` sub-directory. We attempt both locations for convenience and to
    provide a clear error if the file cannot be found.
    """
    direct_path = BASE_DIR / filename
    if direct_path.exists():
        return direct_path
    artifacts_path = ARTIFACT_DIR / filename
    if artifacts_path.exists():
        return artifacts_path
    search_locations = [str(direct_path), str(artifacts_path)]
    raise FileNotFoundError(
        f"Artifact '{filename}' could not be located. Looked in: {search_locations}"
    )


def _load_artifact(filename: str) -> Any:
    """Load a pickled artifact using joblib with helpful error messaging."""
    path = _resolve_artifact_path(filename)
    return joblib.load(path)


app = Flask(__name__, static_folder=".", template_folder=".")

# --- Artifact Loading -------------------------------------------------------------------------
try:
    tfidf_vectorizer = _load_artifact("tfidf_vectorizer.pkl")
    onehot_encoder = _load_artifact("onehot_encoder.pkl")
    label_encoder = _load_artifact("label_encoder.pkl")
    view_prediction_model = _load_artifact("view_prediction_model.pkl")
    age_prediction_model = _load_artifact("age_prediction_model.pkl")
    text_features_matrix = _load_artifact("text_features_matrix.pkl")
    article_mapping = _load_artifact("article_mapping.pkl")
except FileNotFoundError as exc:  # pragma: no cover - occurs only if artifacts missing.
    # Fail fast during startup so the issue can be resolved immediately.
    raise RuntimeError(
        "Required model artifacts are missing. Ensure Phase 1-2 outputs are saved before "
        "starting the server."
    ) from exc

if not isinstance(text_features_matrix, csr_matrix):
    # Convert any compatible sparse matrix to CSR format for efficient row slicing.
    text_features_matrix = csr_matrix(text_features_matrix)

try:
    contents_dataframe = pd.read_csv(CONTENTS_CSV)
except FileNotFoundError as exc:
    raise RuntimeError(
        f"Required contents dataset not found at {CONTENTS_CSV}."
    ) from exc

article_content_lookup: Dict[Any, str] = {}
if "article_id" in contents_dataframe.columns and "content" in contents_dataframe.columns:
    article_content_lookup = {
        str(row.article_id): (row.content if isinstance(row.content, str) else "")
        for row in contents_dataframe.itertuples()
    }
else:  # pragma: no cover - dataset schema mismatch
    raise RuntimeError(
        "contents.csv must contain 'article_id' and 'content' columns."
    )

_encoded_categories: List[str]
try:
    categories_arr = getattr(onehot_encoder, "categories_", None)
    if categories_arr:
        _encoded_categories = sorted(str(cat) for cat in categories_arr[0])
    else:
        _encoded_categories = []
except AttributeError:  # pragma: no cover
    _encoded_categories = []
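
# Note: `categories_` holds one array of labels per encoded input column; the encoder is
# applied to a single "category" column later on, so `categories_[0]` above is the full
# list of category labels known from training.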


def _ensure_dataframe(category: str) -> Any:
    """Create a minimal single-row DataFrame for category encoding.

    Builds the DataFrame when pandas is available, otherwise raises a clear
    ImportError with remediation guidance.
    """
    if pd is None:
        raise ImportError(
            "pandas is required to prepare categorical inputs. Please install pandas or "
            "ensure the training environment's dependencies are mirrored here."
        )
    return pd.DataFrame({"category": [category]})


def _lookup_article_metadata(index: int) -> Dict[str, Any]:
    """Return metadata for a given row index in the text feature matrix.

    During Phase 2 the pipeline should persist either a pandas DataFrame or a
    dictionary-like mapping that provides `article_id` and `title` fields. This
    helper normalises the structure so the API can rely on consistent keys.
    """
    if isinstance(article_mapping, dict):
        entry = article_mapping.get(index)
        if entry is None:
            entry = article_mapping.get(str(index))
        if isinstance(entry, dict):
            return {
                "id": entry.get("article_id") or entry.get("id"),
                "title": entry.get("title") or entry.get("article_title"),
            }
        if isinstance(entry, (list, tuple)) and entry:
            entry_seq = cast(Sequence[Any], entry)
            article_id = entry_seq[0]
            article_title = entry_seq[1] if len(entry_seq) > 1 else entry_seq[0]
            return {"id": article_id, "title": article_title}
    mapping_obj = cast(Any, article_mapping)
    if hasattr(mapping_obj, "iloc"):
        # Supports pandas DataFrame or Series-like objects with iloc.
        try:
            row = mapping_obj.iloc[index]
        except Exception:  # pragma: no cover - defensive guard.
            row = None
        if row is not None:
            if hasattr(row, "to_dict"):
                row_dict = row.to_dict()
                return {
                    "id": row_dict.get("article_id") or row_dict.get("id"),
                    "title": row_dict.get("title") or row_dict.get("article_title"),
                }
    # Fallback: surface the index so downstream consumers can still show something.
    return {"id": int(index), "title": f"Article #{index}"}


def _find_similar_articles(query_vector: csr_matrix, top_k: int = 5) -> List[Dict[str, Any]]:
    """Return the top-k most similar articles for the provided query vector."""
    similarities = cosine_similarity(query_vector, text_features_matrix).ravel()
    # Sort indices by descending similarity.
    ranked_indices = np.argsort(similarities)[::-1]
    similar_articles: List[Dict[str, Any]] = []
    for idx in ranked_indices:
        score = float(similarities[idx])
        # Skip near-identical matches (useful if querying an existing article).
        if score >= 0.9999 and not similar_articles:
            continue
        metadata = _lookup_article_metadata(int(idx))
        metadata.update({"similarity": round(score, 4)})
        similar_articles.append(metadata)
        if len(similar_articles) >= top_k:
            break
    return similar_articles
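
# Shape note: `query_vector` is a single TF-IDF row (1 x V) and `text_features_matrix`
# is N x V, so cosine_similarity returns a (1, N) array that ravel() flattens into one
# score per stored article.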


def _extract_first_sentence(text: str) -> str:
    """Return the first sentence-like fragment from the provided text."""
    if not text:
        return ""
    cleaned = re.sub(r"\s+", " ", text).strip()
    if not cleaned:
        return ""
    sentence_endings = re.split(r"(?<=[.!?!?。])\s+", cleaned)
    for fragment in sentence_endings:
        fragment = fragment.strip()
        if fragment:
            return fragment
    return cleaned[:80]


def generate_seo_suggestions(content: str) -> Dict[str, str]:
    """Generate an SEO title and description using Google Gemini."""
    safe_content = content or ""
    safe_content = re.sub(r"\s+", " ", safe_content).strip()
    prompt = (
        "You are a lead digital editor for a prestigious Korean online media company that bridges in-depth analysis with current trends. "
        "Your mission is to craft an SEO title and description that are both intelligent and highly shareable. The goal is to highlight the article's most timely, newsworthy, and debate-sparking elements to maximize public interest and social engagement.\n\n"
        "Guidelines:\n"
        "1. **'title' (under 60 characters):** **Start with a topic tag in brackets (e.g., `[주제]`)** that summarizes the core subject. Following the tag, frame the core topic as a compelling thesis or a provocative question. Connect it to a current conversation or a surprising trend to make it feel urgent and relevant *today*. It should make people think, 'This is an interesting take.'\n"
        "2. **'description' (under 150 characters, in Korean):** Go beyond summary. Contextualize the article's importance. Explain *why* this topic matters *now* and what new perspective the article offers on a familiar issue. It should persuade readers that this article will give them a crucial viewpoint for today's conversations.\n"
        "3. **Format:** Respond strictly with a valid JSON object with 'title' and 'description' keys. Avoid generic phrases, clickbait, and anything that undermines the intellectual integrity of the brand.\n\n"
        f"Article Content:\n{safe_content}\n\n"
        "Return exactly: {\"title\": \"[<주제>] <생성된 제목>\", \"description\": \"<생성된 설명>\"}"
    )
    try:
        response = SEO_GENERATIVE_MODEL.generate_content(prompt)
        raw_text = getattr(response, "text", "") or ""
        if not raw_text and getattr(response, "candidates", None):
            collected_parts: List[str] = []
            for candidate in response.candidates:  # type: ignore[attr-defined]
                candidate_content: Any = getattr(candidate, "content", None)
                parts = getattr(candidate_content, "parts", None) if candidate_content else None
                if parts:
                    for part in parts:
                        text_part = getattr(part, "text", None)
                        if text_part:
                            collected_parts.append(str(text_part))
            raw_text = " ".join(collected_parts)
        cleaned_text = raw_text.strip()
        if not cleaned_text:
            raise ValueError("SEO model returned an empty response")
        if cleaned_text.startswith("```"):
            cleaned_text = re.sub(r"^```(?:json)?", "", cleaned_text, flags=re.IGNORECASE).strip()
            cleaned_text = re.sub(r"```$", "", cleaned_text).strip()
        match = re.search(r"{.*}", cleaned_text, re.DOTALL)
        json_payload = match.group(0) if match else cleaned_text
        seo_json = json.loads(json_payload)
        suggested_title = str(seo_json.get("title", "")).strip()
        suggested_description = str(seo_json.get("description", "")).strip()
        if not suggested_title or not suggested_description:
            raise ValueError("SEO model response missing required fields")
        return {
            "suggested_title": suggested_title[:60],
            "suggested_description": suggested_description[:150],
        }
    except Exception as exc:  # pragma: no cover - external API failures
        logger.error("SEO model (%s) generation failed: %s", SEO_MODEL_NAME, exc)
        fallback_title = _extract_first_sentence(safe_content) or safe_content[:60]
        fallback_description = safe_content[:150]
        return {
            "suggested_title": fallback_title,
            "suggested_description": fallback_description,
        }
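
# Usage sketch (hypothetical input; the live Gemini response, and therefore the returned
# values, will vary):
#   generate_seo_suggestions("서울시가 새로운 미디어 정책을 발표했다 ...")
#   -> {"suggested_title": "[주제] ...", "suggested_description": "..."}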


# NOTE: the route paths registered below are assumptions; align them with the URLs the
# frontend actually requests.
@app.route("/")
def serve_index() -> Any:
    """Serve the single-page frontend."""
    template_dir = app.template_folder or "."
    return send_from_directory(template_dir, "index.html")


@app.route("/health")
def healthcheck() -> Any:
    """Simple health check endpoint."""
    return jsonify({"status": "ok"})


@app.route("/categories")
def list_categories() -> Any:
    """Expose category options inferred from the fitted OneHotEncoder."""
    return jsonify({"categories": _encoded_categories})


@app.route("/predict", methods=["POST"])
def predict() -> Any:
    """Predict performance for a draft article and build SEO suggestions."""
    payload = request.get_json(silent=True) or {}
    required_fields = {"title", "content", "category"}
    missing = [field for field in required_fields if not payload.get(field)]
    if missing:
        return (
            jsonify({"error": f"Missing required fields: {', '.join(missing)}"}),
            400,
        )

    title: str = str(payload.get("title", "")).strip()
    content: str = str(payload.get("content", "")).strip()
    category: str = str(payload.get("category", "")).strip()
    combined_text = f"{title} {content}".strip()
    if not combined_text:
        return jsonify({"error": "Title and content cannot both be empty."}), 400

    text_vector = tfidf_vectorizer.transform([combined_text])
    try:
        category_frame = _ensure_dataframe(category)
        category_vector = onehot_encoder.transform(category_frame[["category"]])
    except Exception as exc:
        return jsonify({"error": f"Failed to encode category: {exc}"}), 400

    feature_vector = hstack([text_vector, category_vector])
    view_prediction = view_prediction_model.predict(feature_vector)[0]
    predicted_views = int(round(float(view_prediction)))
    age_prediction = age_prediction_model.predict(feature_vector)[0]
    predicted_age_index = int(age_prediction)

    similar_articles_raw = _find_similar_articles(text_vector, top_k=5)
    similar_articles: List[Dict[str, Any]] = []
    for article in similar_articles_raw:
        article_id = article.get("id")
        article_title = article.get("title")
        lookup_key = str(article_id) if article_id is not None else ""
        content_text = article_content_lookup.get(lookup_key, "")
        summary = content_text.strip()[:100]
        similar_articles.append(
            {
                "id": article_id,
                "title": article_title,
                "summary": summary,
            }
        )

    try:
        decoded_age_group = label_encoder.inverse_transform([predicted_age_index])[0]
    except Exception:
        decoded_age_group = str(predicted_age_index)

    seo_recommendation = generate_seo_suggestions(content)
    seo_simulation = {
        "current_state": {
            "issue": "메타 정보에 핵심 키워드가 부족하고 설명이 너무 길어 SERP에서 잘립니다.",
            "title": title[:70] or "제목이 입력되지 않았습니다.",
            "description": (content[:150] + ("..." if len(content) > 150 else "")) if content else "본문이 입력되지 않았습니다.",
        },
        "recommended_state": {
            "title": seo_recommendation["suggested_title"],
            "description": seo_recommendation["suggested_description"],
        },
    }

    response_payload = {
        "ai_prediction": {
            "predicted_views": predicted_views,
            "predicted_age_group": decoded_age_group,
            "similar_articles": similar_articles,
        },
        "seo_simulation": seo_simulation,
    }
    return jsonify(response_payload)


@app.route("/generate_description", methods=["POST"])
def generate_description() -> Any:
    """Generate an SEO title and description for the submitted content only."""
    payload = request.get_json(silent=True) or {}
    content_text = str(payload.get("content", ""))
    if not content_text.strip():
        return jsonify({"error": "Content is required to generate a description."}), 400
    suggestions = generate_seo_suggestions(content_text)
    return jsonify(
        {
            "title": suggestions.get("suggested_title", ""),
            "description": suggestions.get("suggested_description", ""),
        }
    )
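
# Example requests against the assumed routes above (paths, port, and field values are
# illustrative only and may differ in your deployment):
#
#   curl -X POST http://localhost:5000/predict \
#        -H "Content-Type: application/json" \
#        -d '{"title": "<제목>", "content": "<본문>", "category": "<카테고리>"}'
#
#   curl -X POST http://localhost:5000/generate_description \
#        -H "Content-Type: application/json" \
#        -d '{"content": "<본문>"}'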


if __name__ == "__main__":  # pragma: no cover - manual execution only.
    app.run(host="0.0.0.0", port=5000, debug=False)