FinGraph / src /utils /analyze_dates.py
dev-yuje's picture
fix: resolve ruff lint errors and set huggingface python version to 3.10 to fix audioop error
1ecde19
raw
history blame
4.62 kB
"""
analyze_dates.py โ€” ์ˆ˜์ง‘๋œ ๋‰ด์Šค ๊ธฐ์‚ฌ ๋ฐœํ–‰ ์ผ์ž ํŠธ๋ Œ๋“œ ๋ถ„์„ ๋ฐ ์ตœ์  ๊ฐฑ์‹  ์ฃผ๊ธฐ ๋„์ถœ ์Šคํฌ๋ฆฝํŠธ
===================================================================================
"""
import glob
import os
import platform
import matplotlib.pyplot as plt
import pandas as pd
def run_analysis():
    """Report per-day article counts from local ``Articles_*.xlsx`` files.

    Steps:
      1. Load every ``Articles_*.xlsx`` workbook found in the current working
         directory; workbooks that fail to load are reported and skipped.
      2. Concatenate them and drop rows whose ``url`` is duplicated.
      3. Parse ``published_date`` (unparseable values are dropped) and count
         articles per calendar day.
      4. Print the per-day table, the observed span in days, the daily
         average, and a refresh-cycle recommendation chosen by the average
         (>= 10 per day, >= 3 per day, otherwise).
      5. Save a bar chart to ``artifacts/daily_trend_analysis.png``.

    Returns:
        None. Results are printed and the chart is written to disk. The
        function returns early (after printing a message) when no file is
        found, no file can be loaded, or no row has a parseable date.

    Raises:
        KeyError: If the merged data has no ``url`` or ``published_date``
            column.
    """
    # NOTE(review): the user-facing strings in this function look mis-encoded
    # (UTF-8 text decoded with a legacy code page). They are left exactly as
    # found; restore them from the original file if available.
    no_valid_data_msg = "โŒ ์œ ํšจํ•œ ๊ธฐ์‚ฌ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค."

    # 1. Load every Articles_*.xlsx file in the working directory.
    # Sorted so the merge order (and therefore which duplicate row
    # drop_duplicates keeps) does not depend on filesystem listing order.
    files = sorted(glob.glob("Articles_*.xlsx"))
    if not files:
        print("โŒ ๋ถ„์„ํ•  Articles_*.xlsx ํŒŒ์ผ์ด ๋กœ์ปฌ ๋””๋ ‰ํ† ๋ฆฌ์— ์—†์Šต๋‹ˆ๋‹ค.")
        return
    print(f"๐Ÿ“‚ ๋ฐœ๊ฒฌ๋œ ๋‰ด์Šค ๊ธฐ์‚ฌ ํŒŒ์ผ ๋ชฉ๋ก: {files}")

    # 2. Merge the workbooks and remove duplicate articles.
    frames = []
    for path in files:
        try:
            frames.append(pd.read_excel(path))
        except Exception as exc:
            # Deliberately best-effort: one unreadable workbook must not
            # abort the whole analysis.
            print(f"โš ๏ธ {path} ๋กœ๋“œ ์‹คํŒจ: {exc}")
    if not frames:
        print(no_valid_data_msg)
        return

    df_all = pd.concat(frames, ignore_index=True)
    df_all = df_all.drop_duplicates(subset=["url"])  # same article collected twice
    print(f"๐Ÿ“Š ๋ณ‘ํ•ฉ ์™„๋ฃŒ๋œ ๊ณ ์œ  AI ํ•€ํ…Œํฌ ๊ธฐ์‚ฌ ์ด๋Ÿ‰: {len(df_all)}๊ฑด")

    # 3. Parse and sort dates; values that cannot be parsed become NaT and
    # are dropped.
    df_all["published_date"] = pd.to_datetime(df_all["published_date"], errors="coerce")
    df_all = df_all.dropna(subset=["published_date"])
    df_all = df_all.sort_values(by="published_date")
    if df_all.empty:
        # Without this guard the min/max below are NaN and the ``.days``
        # access fails when no row has a usable date.
        print(no_valid_data_msg)
        return

    # Aggregate by calendar day only (time of day is discarded).
    df_all["date_only"] = df_all["published_date"].dt.date
    date_counts = df_all.groupby("date_only").size().reset_index(name="count")

    # 4. Print the per-day table.
    print("\n" + "=" * 50)
    print("๐Ÿ“… [์ผ์ž๋ณ„ AI ํ•€ํ…Œํฌ ๊ธฐ์‚ฌ ์ƒ์‚ฐ ํŠธ๋ Œ๋“œ ํ‘œ]")
    print("=" * 50)
    print(date_counts.to_string(index=False))
    print("=" * 50)

    # 5. Compute the daily average and pick a recommended refresh cycle.
    first_day = date_counts["date_only"].min()
    last_day = date_counts["date_only"].max()
    total_days = (last_day - first_day).days + 1  # inclusive span
    total_articles = date_counts["count"].sum()
    avg_daily = total_articles / max(total_days, 1)
    print(f"โฑ๏ธ ๊ด€์ธก ๊ธฐ๊ฐ„: {total_days}์ผ ({first_day} ~ {last_day})")
    print(f"๐Ÿ“ˆ ์ผํ‰๊ท  AI ํ•€ํ…Œํฌ ๋‰ด์Šค ์ƒ์‚ฐ๋Ÿ‰: {avg_daily:.2f}๊ฑด")

    # Recommendation tiers keyed on the average daily volume.
    if avg_daily >= 10:
        recommendation = "โœจ ๋งค์ผ 1ํšŒ ๊ฐฑ์‹  (ํ•˜๋ฃจ ๊ธฐ์‚ฌ ์ƒ์‚ฐ๋Ÿ‰์ด 10๊ฑด ์ด์ƒ์œผ๋กœ ๋งค์šฐ ๋งŽ์•„, ์‹ค์‹œ๊ฐ„ ํŠธ๋ Œ๋“œ ํฌ์ฐฉ์„ ์œ„ํ•ด ๋งค์ผ ์ƒˆ๋ฒฝ 1์‹œ ์ž๋™ํ™”๊ฐ€ ํ•„์ˆ˜์ ์ž…๋‹ˆ๋‹ค.)"
    elif avg_daily >= 3:
        recommendation = "โœจ 2~3์ผ์— 1ํšŒ ๊ฐฑ์‹  (๊ธฐ์‚ฌ๊ฐ€ 2~3์ผ ๋‹จ์œ„๋กœ ์ ๋‹นํžˆ ๋ชจ์˜€์„ ๋•Œ ๊ทธ๋ž˜ํ”„๋ฅผ ๋นŒ๋“œํ•˜๋Š” ๊ฒƒ์ด API ๋น„์šฉ ๋Œ€๋น„ ์ง€์‹ ๋ฐ€๋„ ์ƒ ๊ฐ€์žฅ ํšจ์œจ์ ์ž…๋‹ˆ๋‹ค.)"
    else:
        recommendation = "โœจ 5์ผ~1์ฃผ์— 1ํšŒ ๊ฐฑ์‹  (AI ํ•€ํ…Œํฌ ํ‹ˆ์ƒˆ ๋„๋ฉ”์ธ ํŠน์„ฑ์ƒ ์ผ์ผ ๋ฐœํ–‰๋Ÿ‰์ด 3๊ฑด ๋ฏธ๋งŒ์œผ๋กœ ํ˜‘์†Œํ•˜๋ฏ€๋กœ, 5์ผ ๊ฐ„๊ฒฉ์œผ๋กœ ๋ชฐ์•„์„œ ๊ฐฑ์‹ ํ•˜๋Š” ๊ฒƒ์ด ํ•ฉ๋ฆฌ์ ์ž…๋‹ˆ๋‹ค.)"
    print("-" * 50)
    print("๐Ÿ’ก [์ตœ์ ์˜ GraphRAG ์ž๋™ํ™” ์ฃผ๊ธฐ ์ œ์•ˆ]")
    print(f" {recommendation}")
    print("=" * 50 + "\n")

    # 6. Draw the bar chart and save it as an image.
    if platform.system() == "Darwin":
        plt.rc("font", family="AppleGothic")  # keeps Korean glyphs from breaking on macOS
    # NOTE(review): the source's indentation was lost, so it is unclear whether
    # this line belonged inside the Darwin branch; it is applied on every platform.
    plt.rcParams["axes.unicode_minus"] = False

    fig = plt.figure(figsize=(10, 5))
    try:
        bars = plt.bar(
            date_counts["date_only"].astype(str),
            date_counts["count"],
            color="royalblue",
            edgecolor="black",
            alpha=0.85,
        )
        # Print the count above each bar.
        for bar in bars:
            height = bar.get_height()
            plt.text(
                bar.get_x() + bar.get_width() / 2.0,
                height + 0.1,
                f"{int(height)}๊ฑด",
                ha="center",
                va="bottom",
                fontsize=10,
                fontweight="bold",
            )
        plt.title("์ผ์ž๋ณ„ AI ํ•€ํ…Œํฌ ๋‰ด์Šค ์ƒ์‚ฐ ํŠธ๋ Œ๋“œ ๋ถ„์„", fontsize=15, pad=15, fontweight="bold")
        plt.xlabel("๊ธฐ์‚ฌ ๋ฐœํ–‰ ์ผ์ž", fontsize=12)
        plt.ylabel("์ƒ์‚ฐ ๊ฑด์ˆ˜", fontsize=12)
        plt.grid(axis="y", linestyle="--", alpha=0.5)
        plt.xticks(rotation=25)
        plt.tight_layout()

        # Store the chart under the artifacts folder.
        os.makedirs("artifacts", exist_ok=True)
        img_path = "artifacts/daily_trend_analysis.png"
        plt.savefig(img_path, dpi=200)
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
    print(f"๐Ÿ’พ ์‹œ๊ฐํ™” ๋ถ„์„ ์ฐจํŠธ ์ €์žฅ ์™„๋ฃŒ โžก๏ธ [์ ˆ๋Œ€๊ฒฝ๋กœ]: {os.path.abspath(img_path)}")
# Run the analysis only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    run_analysis()