File size: 4,622 Bytes
64ad66f 1ecde19 64ad66f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 | """
analyze_dates.py — Trend analysis of the publication dates of collected news articles, and derivation of the optimal refresh cycle.
===================================================================================
"""
import glob
import os
import platform
import matplotlib.pyplot as plt
import pandas as pd
def _load_articles(paths):
    """Read every workbook in *paths* and return the merged table.

    A workbook that cannot be read is reported on stdout and skipped, so a
    single corrupt file does not abort the whole run.

    Args:
        paths: Iterable of Excel file paths.

    Returns:
        One DataFrame with the rows of all readable workbooks (fresh
        0..n-1 index), or ``None`` when no workbook could be read.
    """
    frames = []
    for path in paths:
        try:
            frames.append(pd.read_excel(path))
        except Exception as err:  # deliberate best-effort boundary: report and go on
            print(f"โ ๏ธ {path} ๋ก๋ ์คํจ: {err}")
    if not frames:
        return None
    return pd.concat(frames, ignore_index=True)


def _recommend_cycle(avg_daily):
    """Return the refresh-cycle message for an average articles-per-day value.

    Thresholds: ``>= 10`` selects the first message, ``>= 3`` the second,
    anything lower the third.
    """
    if avg_daily >= 10:
        return "โจ ๋งค์ผ 1ํ ๊ฐฑ์ (ํ๋ฃจ ๊ธฐ์ฌ ์์ฐ๋์ด 10๊ฑด ์ด์์ผ๋ก ๋งค์ฐ ๋ง์, ์ค์๊ฐ ํธ๋ ๋ ํฌ์ฐฉ์ ์ํด ๋งค์ผ ์๋ฒฝ 1์ ์๋ํ๊ฐ ํ์์ ์\n๋๋ค.)"
    if avg_daily >= 3:
        return "โจ 2~3์ผ์ 1ํ ๊ฐฑ์ (๊ธฐ์ฌ๊ฐ 2~3์ผ ๋จ์๋ก ์ ๋นํ ๋ชจ์์ ๋ ๊ทธ๋ํ๋ฅผ ๋น๋ํ๋ ๊ฒ์ด API ๋น์ฉ ๋๋น ์ง์ ๋ฐ๋ ์ ๊ฐ์ฅ ํจ์จ์ ์\n๋๋ค.)"
    return "โจ 5์ผ~1์ฃผ์ 1ํ ๊ฐฑ์ (AI ํํ\nํฌ ํ์ ๋๋ฉ์ธ ํน์ฑ์ ์ผ์ผ ๋ฐํ๋์ด 3๊ฑด ๋ฏธ๋ง์ผ๋ก ํ์ํ๋ฏ๋ก, 5์ผ ๊ฐ๊ฒฉ์ผ๋ก ๋ชฐ์์ ๊ฐฑ์ ํ๋ ๊ฒ์ด ํฉ๋ฆฌ์ ์\n๋๋ค.)"


def _save_trend_chart(date_counts, img_path):
    """Draw the per-day article counts as a bar chart and save it to *img_path*.

    Args:
        date_counts: DataFrame with a ``date_only`` column (x axis) and a
            ``count`` column (bar height).
        img_path: Destination image path; its directory must already exist.
    """
    if platform.system() == "Darwin":
        plt.rc("font", family="AppleGothic")  # font used on macOS so the labels render
    plt.rcParams["axes.unicode_minus"] = False

    fig = plt.figure(figsize=(10, 5))
    try:
        bars = plt.bar(
            date_counts["date_only"].astype(str),
            date_counts["count"],
            color="royalblue",
            edgecolor="black",
            alpha=0.85,
        )
        # Write the count just above each bar.
        for bar in bars:
            height = bar.get_height()
            plt.text(
                bar.get_x() + bar.get_width() / 2.0,
                height + 0.1,
                f"{int(height)}๊ฑด",
                ha="center",
                va="bottom",
                fontsize=10,
                fontweight="bold",
            )
        plt.title("์ผ์๋ณ AI ํํ\nํฌ ๋ด์ค ์์ฐ ํธ๋ ๋ ๋ถ์", fontsize=15, pad=15, fontweight="bold")
        plt.xlabel("๊ธฐ์ฌ ๋ฐํ ์ผ์", fontsize=12)
        plt.ylabel("์์ฐ ๊ฑด์", fontsize=12)
        plt.grid(axis="y", linestyle="--", alpha=0.5)
        plt.xticks(rotation=25)
        plt.tight_layout()
        plt.savefig(img_path, dpi=200)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # release it so repeated calls do not accumulate figures.
        plt.close(fig)


def run_analysis():
    """Analyse per-day publication counts of collected articles and suggest a refresh cycle.

    Workflow:
        1. Load every ``Articles_*.xlsx`` workbook in the current directory.
        2. Merge them and drop rows that repeat a ``url``.
        3. Parse ``published_date``, discard unparseable rows, count per day.
        4. Print the per-day table, the observed period and the daily average.
        5. Print a refresh-cycle recommendation derived from the average.
        6. Save a bar chart to ``artifacts/daily_trend_analysis.png``.

    The function returns early (after printing a message) when there is
    nothing to analyse: no matching files, no readable workbook, a required
    column is absent, or no row has a parseable publication date.

    Returns:
        None. All results are printed or written to disk.

    NOTE(review): the user-facing literals look mis-encoded (UTF-8 text read
    with a Thai code page). They are kept verbatim; a line break that appears
    inside a literal is written as ``\\n``. Confirm against the original file.
    """
    # Message reused for every "nothing usable to analyse" exit.
    no_data_msg = "โ ์ ํจํ ๊ธฐ์ฌ ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค."

    # 1. Locate the article workbooks. glob returns matches in arbitrary
    #    order; sorting keeps the "first duplicate wins" rule reproducible.
    files = sorted(glob.glob("Articles_*.xlsx"))
    if not files:
        print("โ ๋ถ์ํ  Articles_*.xlsx ํ์ผ์ด ๋ก์ปฌ ๋๋ ํ ๋ฆฌ์ ์์ต๋๋ค.")
        return
    print(f"๐ ๋ฐ๊ฒฌ๋ ๋ด์ค ๊ธฐ์ฌ ํ์ผ ๋ชฉ๋ก: {files}")

    # 2. Merge and de-duplicate.
    df_all = _load_articles(files)
    if df_all is None:
        print(no_data_msg)
        return

    missing = [col for col in ("url", "published_date") if col not in df_all.columns]
    if missing:
        # Without these columns the steps below would fail with a bare KeyError.
        print(f"{no_data_msg} (missing columns: {', '.join(missing)})")
        return

    # NOTE(review): rows with a missing url compare as duplicates of each
    # other in pandas, so at most one of them survives — confirm intended.
    df_all = df_all.drop_duplicates(subset=["url"])
    print(f"๐ ๋ณํฉ ์๋ฃ๋ ๊ณ ์  AI ํํ\nํฌ ๊ธฐ์ฌ ์ด๋: {len(df_all)}๊ฑด")

    # 3. Normalise dates; unparseable values become NaT and are dropped.
    df_all["published_date"] = pd.to_datetime(df_all["published_date"], errors="coerce")
    df_all = df_all.dropna(subset=["published_date"])
    df_all = df_all.sort_values(by="published_date")
    if df_all.empty:
        # No usable date at all: min()/max() below would be NaN and the
        # ``.days`` access would raise AttributeError.
        print(no_data_msg)
        return

    # Keep only the calendar day and count articles per day.
    df_all["date_only"] = df_all["published_date"].dt.date
    date_counts = df_all.groupby("date_only").size().reset_index(name="count")

    # 4. Per-day table.
    print("\n" + "=" * 50)
    print("๐\n[์ผ์๋ณ AI ํํ\nํฌ ๊ธฐ์ฌ ์์ฐ ํธ๋ ๋ ํ]")
    print("=" * 50)
    print(date_counts.to_string(index=False))
    print("=" * 50)

    # 5. Daily average over the inclusive first..last day span.
    first_day = date_counts["date_only"].min()
    last_day = date_counts["date_only"].max()
    total_days = (last_day - first_day).days + 1
    total_articles = date_counts["count"].sum()
    avg_daily = total_articles / max(total_days, 1)
    print(f"โฑ๏ธ ๊ด์ธก ๊ธฐ๊ฐ: {total_days}์ผ ({first_day} ~ {last_day})")
    print(f"๐ ์ผํ๊ท  AI ํํ\nํฌ ๋ด์ค ์์ฐ๋: {avg_daily:.2f}๊ฑด")

    recommendation = _recommend_cycle(avg_daily)
    print("-" * 50)
    print("๐ก [์ต์ ์ GraphRAG ์๋ํ ์ฃผ๊ธฐ ์ ์]")
    print(f" {recommendation}")
    print("=" * 50 + "\n")

    # 6. Chart output goes under ./artifacts.
    os.makedirs("artifacts", exist_ok=True)
    img_path = "artifacts/daily_trend_analysis.png"
    _save_trend_chart(date_counts, img_path)
    print(f"๐พ ์๊ฐํ ๋ถ์ ์ฐจํธ ์ ์ฅ ์๋ฃ โก๏ธ [์ ๋๊ฒฝ๋ก]: {os.path.abspath(img_path)}")
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_analysis()
|