tjrlgns09 commited on
Commit
d2100e7
ยท
1 Parent(s): a52c5c7
SQL_Example.txt CHANGED
@@ -20,4 +20,69 @@ CREATE INDEX idx_content_embedding ON t_test_textembedding USING hnsw (content_e
20
 
21
  --------------------------------
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
 
 
20
 
21
  --------------------------------
22
 
23
+ -- 1. Create the pgvector extension first if it is not installed yet.
24
+ CREATE EXTENSION IF NOT EXISTS vector;
25
+
26
+ -- 2. Create the table
27
+ CREATE TABLE t_test_imgembedding (
28
+ id BIGSERIAL PRIMARY KEY, -- PK (auto-increment)
29
+ title VARCHAR ,
30
+ url VARCHAR ,
31
+ mimetype VARCHAR ,
32
+ img_embedding VECTOR(1280), -- image embedding (1280 dimensions, matching VECTOR(1280))
33
+ created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP -- creation timestamp
34
+ );
35
+
36
+ -- 3. (Optional) Create an index to speed up vector search (HNSW algorithm, cosine distance operator class)
37
+ CREATE INDEX idx_test_imgembedding ON t_test_imgembedding USING hnsw (img_embedding vector_cosine_ops);
38
+
39
+
40
+
41
+
42
+ ----------------------------------
43
+
44
+
45
+ // title_embedding_arr and content_embedding_arr are
46
+ // arrays of 768 numbers produced by the ONNX embedding model.
47
+
48
+ let insertData = await db.query(
49
+ `
50
+ INSERT INTO t_test_textembedding (title, title_embedding, content, content_embedding)
51
+ VALUES ($1, $2, $3, $4)
52
+ RETURNING id, title, created_at;
53
+ `,
54
+ [
55
+ title,
56
+ JSON.stringify(title_embedding_arr), // DB ๋“œ๋ผ์ด๋ฒ„ ํ˜ธํ™˜์„ฑ์„ ์œ„ํ•ด ๋ฌธ์ž์—ด ํฌ๋งท '[...]' ์œผ๋กœ ๋ณ€ํ™˜
57
+ content,
58
+ JSON.stringify(content_embedding_arr)
59
+ ]
60
+ );
61
+
62
+ console.log('์ƒ์„ฑ๋œ ๋ฐ์ดํ„ฐ:', insertData.rows[0]);
63
+
64
+
65
+
66
+ -----------------------------------------
67
+
68
+
69
+ // query_embedding_arr is the embedding array obtained by running the user's search query through the ONNX model.
70
+
71
+ let searchResult = await db.query(
72
+ `
73
+ SELECT
74
+ id,
75
+ title,
76
+ content,
77
+ -- ์ฝ”์‚ฌ์ธ ๊ฑฐ๋ฆฌ๋Š” 0์— ๊ฐ€๊นŒ์šธ์ˆ˜๋ก ์œ ์‚ฌํ•˜๋ฏ€๋กœ, ์ง๊ด€์ ์ธ '์œ ์‚ฌ๋„ ์ ์ˆ˜'๋ฅผ ์œ„ํ•ด 1์—์„œ ๋บ๋‹ˆ๋‹ค.
78
+ 1 - (content_embedding <=> $1) AS similarity_score
79
+ FROM t_test_textembedding
80
+ -- ์œ ์‚ฌ๋„ ์ž„๊ณ„๊ฐ’ ์„ค์ • (์˜ˆ: ์ •ํ™•๋„ ์ ์ˆ˜๊ฐ€ 0.5 ์ด์ƒ์ธ ๊ฒƒ๋งŒ)
81
+ WHERE 1 - (content_embedding <=> $1) > 0.5
82
+ ORDER BY content_embedding <=> $1 ASC
83
+ LIMIT 5;
84
+ `,
85
+ [JSON.stringify(query_embedding_arr)]
86
+ );
87
 
88
+ console.log('์œ ์‚ฌํ•œ ๋ฐ์ดํ„ฐ ๋ชฉ๋ก:', searchResult.rows);
app.py CHANGED
@@ -5,6 +5,8 @@ from contextlib import asynccontextmanager
5
 
6
  from router import llamindex_router
7
  from router import embedding_router
 
 
8
 
9
  @asynccontextmanager
10
  async def lifespan_manager(app: FastAPI):
@@ -12,9 +14,15 @@ async def lifespan_manager(app: FastAPI):
12
  ์„œ๋ฒ„ ์‹œ์ž‘ ์‹œ ๋ชจ๋ธ์„ ๋กœ๋“œํ•˜๊ณ  ์ข…๋ฃŒ ์‹œ ์ •๋ฆฌํ•ฉ๋‹ˆ๋‹ค.
13
  """
14
 
 
 
 
15
  # ์„œ๋ฒ„๊ฐ€ ์š”์ฒญ ์ฒ˜๋ฆฌ๋ฅผ ์‹œ์ž‘ํ•˜๋„๋ก ์ œ์–ด๊ถŒ์„ ๋„˜๊ฒจ์ค๋‹ˆ๋‹ค.
16
  yield
17
 
 
 
 
18
  # FastAPI ์• ํ”Œ๋ฆฌ์ผ€์ด์…˜ ์ดˆ๊ธฐํ™”
19
  app = FastAPI(
20
  title="RAG+LLM",
@@ -33,6 +41,7 @@ app.add_middleware(
33
 
34
  app.include_router(llamindex_router.router, prefix="/llama_index")
35
  app.include_router(embedding_router.router, prefix="/embedding")
 
36
 
37
  # ํ—ฌ์Šค ์ฒดํฌ์šฉ ๊ธฐ๋ณธ ์—”๋“œํฌ์ธํŠธ
38
  @app.get("/", summary="API ํ—ฌ์Šค ์ฒดํฌ")
 
5
 
6
  from router import llamindex_router
7
  from router import embedding_router
8
+ from router import image_embedding_router
9
+ from core.cronjob import news_scheduler
10
 
11
  @asynccontextmanager
12
  async def lifespan_manager(app: FastAPI):
 
14
  ์„œ๋ฒ„ ์‹œ์ž‘ ์‹œ ๋ชจ๋ธ์„ ๋กœ๋“œํ•˜๊ณ  ์ข…๋ฃŒ ์‹œ ์ •๋ฆฌํ•ฉ๋‹ˆ๋‹ค.
15
  """
16
 
17
+ # ์Šค์ผ€์ค„๋Ÿฌ ์‹œ์ž‘
18
+ #news_scheduler.start()
19
+
20
  # ์„œ๋ฒ„๊ฐ€ ์š”์ฒญ ์ฒ˜๋ฆฌ๋ฅผ ์‹œ์ž‘ํ•˜๋„๋ก ์ œ์–ด๊ถŒ์„ ๋„˜๊ฒจ์ค๋‹ˆ๋‹ค.
21
  yield
22
 
23
+ # ์Šค์ผ€์ค„๋Ÿฌ ์ข…๋ฃŒ
24
+ #news_scheduler.shutdown()
25
+
26
  # FastAPI ์• ํ”Œ๋ฆฌ์ผ€์ด์…˜ ์ดˆ๊ธฐํ™”
27
  app = FastAPI(
28
  title="RAG+LLM",
 
41
 
42
  app.include_router(llamindex_router.router, prefix="/llama_index")
43
  app.include_router(embedding_router.router, prefix="/embedding")
44
+ app.include_router(image_embedding_router.router, prefix="/image_embedding")
45
 
46
  # ํ—ฌ์Šค ์ฒดํฌ์šฉ ๊ธฐ๋ณธ ์—”๋“œํฌ์ธํŠธ
47
  @app.get("/", summary="API ํ—ฌ์Šค ์ฒดํฌ")
core/cronjob.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from apscheduler.schedulers.asyncio import AsyncIOScheduler
2
+ from apscheduler.triggers.cron import CronTrigger
3
+ from datetime import datetime
4
+ import urllib.parse
5
+ import feedparser
6
+ import asyncio
7
+ import pytz
8
+ from time import mktime
9
+
10
+ # ์ถ”๊ฐ€๋œ ์ž„ํฌํŠธ (DB ๋ฐ ๋ชจ๋ธ, ์ž„๋ฒ ๋”ฉ)
11
+ from core.database import SessionLocal
12
+ from core.models import NewsEmbedding
13
+ from core.dependencies import get_embedding_model
14
+ import os
15
+ import requests
16
+ from newspaper import Article
17
+ from langchain_google_genai import ChatGoogleGenerativeAI
18
+ from langchain_core.messages import HumanMessage
19
+ from apscheduler.triggers.interval import IntervalTrigger
20
+
21
+ # Initialize Gemini
22
# Module-wide Gemini chat client. The API key is read from the GOOGLE_API_KEY
# environment variable at import time.
_GEMINI_SETTINGS = {
    "model": "gemini-2.5-flash-lite",
    "temperature": 0.1,
    "google_api_key": os.getenv("GOOGLE_API_KEY"),
}
llm = ChatGoogleGenerativeAI(**_GEMINI_SETTINGS)
27
+
28
+ # (is_real_gold_news ํ•จ์ˆ˜๋Š” ๊ธฐ์กด๊ณผ ๋™์ผํ•˜๊ฒŒ ์œ ์ง€)
29
def is_real_gold_news(title):
    """Return True when ``title`` looks like news about gold as a commodity.

    The check is a case-insensitive substring match against two keyword lists:
    a black list of words that merely contain the syllable for "gold" or the
    word "golden" (Friday, subsidy, fine, Golden Retriever, ...) and a white
    list of market-related terms (ounce, bullion, price, ...). A black-list hit
    always wins; otherwise at least one white-list hit is required.

    Args:
        title: Headline text. ``None`` or an empty string is treated as
            "not gold news".

    Returns:
        bool: True only if no black-list word and at least one white-list word
        occurs in the lower-cased title.
    """
    # Guard against missing titles instead of failing on ``None.lower()``.
    if not title:
        return False

    title_lower = title.lower()

    # Korean keywords are written as real Hangul so they can actually match
    # Korean headlines (the scraped copy of this file had them mis-encoded).
    black_list = (
        "금요일", "보조금", "장학금", "지원금", "벌금", "출금", "입금", "금지",
        "송금", "대출금", "모금", "기금", "과징금",
        "golden retriever", "golden state", "golden globe", "golden rule",
        "marigold",
    )
    white_list = (
        "온스", "골드바", "시세", "순금", "거래소", "달러", "투자", "금값",
        "한국금거래소", "krx", "금펀드",
        "ounce", "bullion", "price", "market", "fed", "inflation", "xau",
        "spot", "invest",
    )

    if any(bad_word in title_lower for bad_word in black_list):
        return False
    return any(good_word in title_lower for good_word in white_list)
37
+
38
+ # 2. RSS ํ”ผ๋“œ๋ฅผ ๊ฐ€์ ธ์™€์„œ DB์— ์ €์žฅํ•˜๋Š” ํ•จ์ˆ˜๋กœ ์—…๊ทธ๋ ˆ์ด๋“œ
39
def fetch_filter_and_save_news(keyword, hl, gl, ceid, db_session, max_news=3):
    """Fetch Google News RSS results for ``keyword`` and stage new gold articles.

    For every feed entry whose title passes ``is_real_gold_news`` and whose
    exact title is not yet stored, the article page is downloaded and its body
    text extracted (newspaper first, plain ``<p>`` scraping as a fallback).

    NOTE: the loop currently stops after printing a preview of the extracted
    text (see the unconditional ``continue`` below). That short-circuit is
    kept from the original on purpose, so in its present state this function
    never calls Gemini, never writes to the database and always returns an
    empty list. The summarising / embedding / persistence code after it is
    retained for when the short-circuit is removed.

    Args:
        keyword: Search term; URL-quoted before being put into the RSS query.
        hl: Google News ``hl`` query parameter.
        gl: Google News ``gl`` query parameter.
        ceid: Google News ``ceid`` query parameter.
        db_session: SQLAlchemy session used for the duplicate lookup, ``add``
            and the final ``commit``.
        max_news: Stop once this many articles have been staged.

    Returns:
        list[tuple[str, str]]: ``(title, link)`` of every article added to the
        session. The session is committed only when this list is non-empty.
    """
    # Function-scope imports for names this function needs but the module
    # never imported (BeautifulSoup was referenced without any import, which
    # made the fallback path fail with a swallowed NameError).
    from calendar import timegm
    from bs4 import BeautifulSoup
    from newspaper import Config

    url_keyword = urllib.parse.quote(keyword)
    rss_url = f"https://news.google.com/rss/search?q={url_keyword}&hl={hl}&gl={gl}&ceid={ceid}"

    feed = feedparser.parse(rss_url)
    valid_news = []

    # Shared embedding model instance provided by core.dependencies.
    embedder = get_embedding_model()

    # Loop-invariant request settings: browser-like headers and the newspaper
    # configuration are identical for every entry, so build them once.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
    }
    config = Config()
    config.browser_user_agent = headers['User-Agent']
    config.request_timeout = 15

    for entry in feed.entries:
        if len(valid_news) >= max_news:
            break

        if not is_real_gold_news(entry.title):
            continue

        # Duplicate check: skip entries whose exact title is already stored.
        exists = db_session.query(NewsEmbedding).filter(NewsEmbedding.title == entry.title).first()
        if exists:
            continue

        # Publication time. feedparser exposes ``published_parsed`` as a UTC
        # struct_time, so it must be converted with timegm(); mktime() would
        # interpret it as local time and shift the stored timestamp.
        published = getattr(entry, 'published_parsed', None)
        if published:
            pub_date = datetime.fromtimestamp(timegm(published), pytz.UTC)
        else:
            pub_date = datetime.now(pytz.UTC)

        try:
            # 1. Follow redirects to learn the article's final URL.
            page = requests.get(entry.link, timeout=15, headers=headers, allow_redirects=True)
            real_url = page.url

            # 2. Let newspaper download and parse the article body.
            article = Article(real_url, config=config)
            article.download()
            article.parse()

            article_text = article.text.strip()

            # If newspaper found (almost) nothing, or only scraped the
            # "Google News" shell page, fall back to collecting the longer
            # <p> elements of the page fetched above (last resort).
            if len(article_text) < 100 or "Google News" in article_text[:100]:
                soup = BeautifulSoup(page.text, 'html.parser')
                article_text = ' '.join([p.text for p in soup.find_all('p') if len(p.text) > 20])

            if len(article_text) < 100:
                print(f"⚠️ 본문 추출 실패 (내용 부족): {entry.title}")
                continue

            # Debug output
            print(f"\n[기사 제목]: {entry.title}")
            print(f"[실제 주소]: {real_url}")
            print(f"[본문 미리보기]:\n{article_text[:300]}...\n")
            print("-" * 50)

            # Deliberate short-circuit kept from the original: until testing
            # is finished, stop here so that no Gemini API call is made.
            # Everything below in this loop body is unreachable while this
            # ``continue`` is present.
            continue

            prompt = f"""
            다음은 뉴스 기사 원문입니다:
            {article_text[:3000]}

            이 뉴스가 금(Gold, 귀금속/투자자산/금값)과 관련된 실제 뉴스인지 판별하고,
            맞다면 기사의 핵심 내용을 1~2줄로 요약해주세요.
            만약 금과 전혀 관련이 없는 뉴스라면 (예: 금요일, 벌금, 장학금, 출금, 송금, 보조금, 단순히 '금'이 포함된 단어만 있는 기사 등)
            'NOT_GOLD_NEWS' 라고만 정확히 답변하세요.

            요약 결과:
            """
            # Separate name so the HTTP response above is not shadowed.
            llm_reply = llm.invoke([HumanMessage(content=prompt)])
            summary = llm_reply.content.strip()

            # Substring test: the model may wrap the marker in quotes or add
            # punctuation, which an exact comparison would miss.
            if "NOT_GOLD_NEWS" in summary:
                print(f"❌ [Gemini 필터링] 가짜 금 뉴스 스킵: {entry.title}")
                continue

            content_text = summary
            print(f"✅ [Gemini 요약 성공] 요약문: {content_text}")

        except Exception as e:
            # Best effort per article: report the failure and move on.
            print(f"⚠️ 기사 본문 추출 또는 요약 실패 ({entry.title}): {e}")
            continue

        # Embeddings for the title and for the summary stored in ``content``.
        title_emb = embedder.embed_query(entry.title)
        content_emb = embedder.embed_query(content_text)

        # Build the ORM row and stage it in the session.
        new_article = NewsEmbedding(
            title=entry.title,
            title_embedding=title_emb,
            content=content_text,
            content_embedding=content_emb,
            created_at=pub_date
        )
        db_session.add(new_article)

        valid_news.append((entry.title, entry.link))
        print(f"✅ DB 추가 예약: {entry.title}")

    # Persist staged rows in one commit.
    if valid_news:
        db_session.commit()

    return valid_news
162
+
163
+ # 3. ๋ฉ”์ธ ๊ฒ€์ƒ‰ ํ•จ์ˆ˜ (DB ์„ธ์…˜ ๊ด€๋ฆฌ ์ถ”๊ฐ€)
164
async def search_gold_news():
    """Scheduled job: collect Korean and English gold news and store new items.

    The actual work (RSS fetch, article download, DB access) is synchronous
    and can block for many seconds, so it is executed in the event loop's
    default executor instead of directly inside this coroutine. Running it
    inline would stall every other task on the loop while it waits on the
    network.

    One database session is opened per run and used by a single worker
    thread. Any exception raised while collecting is reported, the session is
    rolled back, and the session is always closed.
    """
    print(f"\n=== 🌟 [{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] 자동 검색 및 DB 저장 시작 ===")

    def _collect_once():
        """Blocking part of the job; runs entirely in one worker thread."""
        db = SessionLocal()
        try:
            print("\n[국내 뉴스 검색 중...]")
            kr_news = fetch_filter_and_save_news("금", "ko", "KR", "KR:ko", db, max_news=3)
            if not kr_news:
                print("새로운 국내 뉴스가 없거나 모두 이미 저장된 기사야.")

            print("\n[해외 뉴스 검색 중...]")
            en_news = fetch_filter_and_save_news("gold", "en", "US", "US:en", db, max_news=3)
            if not en_news:
                print("새로운 해외 뉴스가 없거나 모두 이미 저장된 기사야.")

        except Exception as e:
            print(f"❌ DB 저장 중 에러 발생: {e}")
            db.rollback()
        finally:
            # Always release the session, whatever happened above.
            db.close()

    await asyncio.get_running_loop().run_in_executor(None, _collect_once)

    print("==========================================\n")
188
+
189
+ # ์Šค์ผ€์ค„๋Ÿฌ ์„ค์ • (๊ธฐ์กด๊ณผ ๋™์ผ)
190
def create_scheduler():
    """Build the AsyncIO scheduler that runs ``search_gold_news`` periodically.

    The scheduler uses the Asia/Seoul timezone and currently fires the job
    every 60 seconds. The daily 18:00 cron trigger is left disabled:
        CronTrigger(hour=18, minute=0)
    """
    news_job_scheduler = AsyncIOScheduler(timezone="Asia/Seoul")
    every_minute = IntervalTrigger(seconds=60)
    news_job_scheduler.add_job(search_gold_news, every_minute)
    return news_job_scheduler


# Module-level scheduler instance imported by the application.
news_scheduler = create_scheduler()
core/database.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import create_engine
2
+ from sqlalchemy.orm import sessionmaker, declarative_base
3
+ import os
4
+ from dotenv import load_dotenv
5
+
6
# Load variables from a local .env file, if one exists.
load_dotenv()

# Database connection URL, read from the DATABASE_URL environment variable.
# There is no built-in fallback value, so fail fast with an explicit message
# instead of passing ``None`` on to create_engine().
SQLALCHEMY_DATABASE_URL = os.getenv("DATABASE_URL")
if not SQLALCHEMY_DATABASE_URL:
    raise RuntimeError(
        "DATABASE_URL is not set; define it in the environment or in a .env file."
    )

# SQLAlchemy engine. Hosted databases (the original note mentions Neon) may
# drop idle connections, so pool_pre_ping=True checks a pooled connection
# before handing it out.
engine = create_engine(
    SQLALCHEMY_DATABASE_URL,
    pool_pre_ping=True,
)

# Session factory: explicit commits, no automatic flush.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Declarative base class for the ORM models.
Base = declarative_base()
27
+
28
+ # DB ์„ธ์…˜ ์˜์กด์„ฑ ์ฃผ์ž…์„ ์œ„ํ•œ ํ•จ์ˆ˜ (FastAPI ๋ผ์šฐํ„ฐ์—์„œ ์‚ฌ์šฉ)
29
def get_db():
    """Yield a fresh database session and close it once the caller is done.

    Written as a generator so it can be used for dependency injection (the
    original note says it is meant for FastAPI routers).
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
core/dependencies.py CHANGED
@@ -70,8 +70,38 @@ class OnnxGemmaWrapper(Embeddings):
70
  def embed_query(self, text: str) -> List[float]:
71
  return self.encode_query(text).tolist()
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  # ์ „์—ญ ์‹ฑ๊ธ€ํ†ค ์ธ์Šคํ„ด์Šค ์ €์žฅ์†Œ
74
  _embedding_model = None
 
75
 
76
  def get_embedding_model() -> OnnxGemmaWrapper:
77
  """
@@ -84,3 +114,12 @@ def get_embedding_model() -> OnnxGemmaWrapper:
84
  token=hf_token
85
  )
86
  return _embedding_model
 
 
 
 
 
 
 
 
 
 
70
  def embed_query(self, text: str) -> List[float]:
71
  return self.encode_query(text).tolist()
72
 
73
+ import torch
74
+ import torchvision.transforms as transforms
75
+ from torchvision.models import efficientnet_v2_s, EfficientNet_V2_S_Weights
76
+ from PIL import Image
77
+
78
+ # ... (existing OnnxGemmaWrapper and get_embedding_model)
79
+
80
class EfficientNetV2Embedding:
    """Image feature extractor built on torchvision's EfficientNetV2-S.

    The network is loaded with its default pretrained weights, switched to
    eval mode, and its classification head is replaced by an identity layer
    so the forward pass returns the pooled feature vector instead of class
    scores.
    """

    def __init__(self):
        print("Loading EfficientNetV2-S model...")
        self.weights = EfficientNet_V2_S_Weights.DEFAULT
        self.model = efficientnet_v2_s(weights=self.weights)
        self.model.eval()

        # Remove the classification head to get embeddings
        self.model.classifier = torch.nn.Identity()

        # Preprocessing pipeline that ships with the selected weights.
        self.preprocess = self.weights.transforms()
        print("EfficientNetV2-S model loaded successfully.")

    def embed_image(self, image: Image.Image) -> List[float]:
        """Return the embedding of ``image`` as a flat list of floats.

        Args:
            image: A PIL image. Images that are not in RGB mode (grayscale,
                RGBA, palette, ...) are converted first, because the weight
                preset normalises with three-channel statistics and would
                otherwise fail on a different channel count.

        Returns:
            The model output for the single image, with the batch dimension
            removed.
        """
        if image.mode != "RGB":
            image = image.convert("RGB")

        # Preprocess and add the batch dimension expected by the model.
        img_tensor = self.preprocess(image).unsqueeze(0)

        # Inference only: no gradients needed.
        with torch.no_grad():
            embedding = self.model(img_tensor)

        return embedding.squeeze(0).tolist()
101
+
102
  # ์ „์—ญ ์‹ฑ๊ธ€ํ†ค ์ธ์Šคํ„ด์Šค ์ €์žฅ์†Œ
103
  _embedding_model = None
104
+ _image_embedding_model = None
105
 
106
  def get_embedding_model() -> OnnxGemmaWrapper:
107
  """
 
114
  token=hf_token
115
  )
116
  return _embedding_model
117
+
118
def get_image_embedding_model() -> EfficientNetV2Embedding:
    """Return the shared EfficientNetV2-S embedder, creating it on first use.

    The instance is cached in the module-level ``_image_embedding_model`` so
    the model is loaded only once per process.
    """
    global _image_embedding_model
    if _image_embedding_model is not None:
        return _image_embedding_model
    _image_embedding_model = EfficientNetV2Embedding()
    return _image_embedding_model
core/models.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import Column, BigInteger, String, Text, DateTime
2
+ from pgvector.sqlalchemy import Vector
3
+ from core.database import Base
4
+
5
+ class NewsEmbedding(Base):
6
+ # ORM mapping for the t_test_textembedding table; per the original note, the table name and schema mirror an existing table definition.
7
+ __tablename__ = "t_test_textembedding"
8
+
9
+ id = Column(BigInteger, primary_key=True, autoincrement=True)
10
+ title = Column(String(500), nullable=False)
11
+ # Set to 768, which the original note gives as the default output dimension of the Gemma embedding model.
12
+ title_embedding = Column(Vector(768))
13
+ content = Column(Text, nullable=False)
14
+ content_embedding = Column(Vector(768))
15
+ created_at = Column(DateTime(timezone=True))
requirements.txt CHANGED
@@ -20,4 +20,12 @@ langchain-community
20
  langchain-huggingface
21
  langchain-google-genai
22
 
23
- onnxruntime
 
 
 
 
 
 
 
 
 
20
  langchain-huggingface
21
  langchain-google-genai
22
 
23
+ onnxruntime
24
+ apscheduler
25
+ feedparser
26
+ sqlalchemy
27
+ psycopg2-binary
28
+ pgvector
29
+ pytz
30
+ newspaper3k
31
+ lxml_html_clean
router/image_embedding_router.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException, UploadFile, File
2
+ from pydantic import BaseModel
3
+ from typing import List, Optional, Any
4
+ from io import BytesIO
5
+ from PIL import Image
6
+
7
+ # ์ฝ”์–ด ๋ชจ๋“ˆ์—์„œ ๋ชจ๋ธ ๊ฐ€์ ธ์˜ค๊ธฐ (์‹ฑ๊ธ€ํ†ค ๋ณด์žฅ)
8
+ from core.dependencies import get_image_embedding_model
9
+
10
+ router = APIRouter(tags=["Image Embedding"])
11
+
12
+ # Obtain the shared image-embedding model once, when this router module is imported.
13
+ image_embedding_model = get_image_embedding_model()
14
+
15
+ class ImageEmbeddingResponse(BaseModel):
+ # Response envelope used as the response_model of the image-embedding endpoint.
16
+ success: bool  # True when an embedding was produced, False when an exception was caught
17
+ data: Optional[Any] = None  # payload; the endpoint returns {"embedding": [...]} on success, None on failure
18
+ msg: Optional[str] = None  # str(exception) on failure, empty string on success
19
+
20
@router.post("/image_to_embedding", response_model=ImageEmbeddingResponse)
async def image_to_embedding(file: UploadFile = File(...)):
    """
    Accept an uploaded image file and return its EfficientNetV2-S embedding.

    The upload is read asynchronously; decoding and model inference are
    CPU-bound, so they run in the thread pool rather than on the event loop,
    which would otherwise be blocked for the duration of the forward pass.

    Returns a dict matching ``ImageEmbeddingResponse``: on success
    ``{"success": True, "data": {"embedding": [...]}, "msg": ""}`` (the
    original note gives the vector size as 1280), on any error
    ``{"success": False, "data": None, "msg": <error text>}``.
    """
    # Function-scope import: the module did not import this helper before.
    from fastapi.concurrency import run_in_threadpool

    def _decode_and_embed(raw: bytes):
        """Decode the image bytes as RGB and run the embedding model."""
        image = Image.open(BytesIO(raw)).convert("RGB")
        return image_embedding_model.embed_image(image)

    try:
        # Read the uploaded file.
        contents = await file.read()

        # Decode and embed off the event loop.
        emb_vector = await run_in_threadpool(_decode_and_embed, contents)

        return {"success": True, "data": {"embedding": emb_vector}, "msg": ""}
    except Exception as e:
        # Errors are reported in the response body, as callers expect.
        return {"success": False, "data": None, "msg": str(e)}
test_gemini.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import feedparser
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import os
5
+ from langchain_google_genai import ChatGoogleGenerativeAI
6
+ from langchain_core.messages import HumanMessage
7
+
8
def test():
    """Manual smoke test: summarise the first Google News hit with Gemini.

    Fetches the Korean Google News RSS search for the word "gold", downloads
    the first entry's page, extracts its visible text and asks Gemini either
    to summarise it or to answer ``NOT_GOLD_NEWS``. Results and errors are
    printed; nothing is returned. Requires network access and Gemini
    credentials available to ``ChatGoogleGenerativeAI``.
    """
    rss_url = "https://news.google.com/rss/search?q=%EA%B8%88&hl=ko&gl=KR&ceid=KR:ko"
    feed = feedparser.parse(rss_url)
    if not feed.entries:
        return
    entry = feed.entries[0]
    print("Link:", entry.link)

    try:
        # fetch
        r = requests.get(entry.link, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
        # Surface HTTP errors instead of sending an error page to the model.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
        print("Text preview:", text[:200])

        # Test Gemini
        llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.1)

        prompt = f"""
        다음은 뉴스 기사 원문입니다:
        {text[:3000]}

        이 뉴스가 금(Gold, 귀금속/투자자산)과 관련된 실제 뉴스인지 판별하고,
        맞다면 기사의 핵심 내용을 1~2줄로 요약해주세요.
        만약 금과 전혀 관련이 없는 뉴스라면 (예: 금요일, 송금, 시세 없는 일반 기사 등)
        'NOT_GOLD_NEWS' 라고만 정확히 답변하세요.

        요약 결과:
        """
        response = llm.invoke([HumanMessage(content=prompt)])
        print("\nGemini Response:", response.content)
    except Exception as e:
        print("Error:", e)


if __name__ == "__main__":
    test()
test_image_embedding.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import os
3
+
4
def test_image_embedding(
    url="http://localhost:8000/image_embedding/image_to_embedding",
    image_path=r"C:\Users\itg\.gemini\antigravity\brain\a2d1bd2b-b329-461a-ab89-c0d64934f5fb\test_image_for_embedding_1772686600102.png",
    timeout=30,
):
    """Post an image to the embedding endpoint and check the vector length.

    Args:
        url: Endpoint to call. Defaults to the local development server.
        image_path: Image file to upload. Defaults to the path used by the
            original author; pass another path to run this elsewhere.
        timeout: Seconds to wait for the HTTP request, so a hung server
            cannot block the script forever.

    Prints the outcome of each step and returns ``None``. If the image file
    does not exist, an error is printed and no request is sent.
    """
    if not os.path.exists(image_path):
        print(f"Error: {image_path} not found.")
        return

    with open(image_path, "rb") as f:
        # Send only the file name, not the full local path, as the upload name.
        files = {"file": (os.path.basename(image_path), f, "image/png")}
        try:
            response = requests.post(url, files=files, timeout=timeout)
            if response.status_code != 200:
                print(f"HTTP Error: {response.status_code}")
                return

            result = response.json()
            if not result["success"]:
                print(f"API Error: {result['msg']}")
                return

            embedding = result["data"]["embedding"]
            print(f"Successfully retrieved embedding. Dimension: {len(embedding)}")
            # EfficientNetV2-S embedding dimension should be 1280
            if len(embedding) == 1280:
                print("Verification PASSED: Embedding dimension is 1280.")
            else:
                print(f"Verification FAILED: Expected dimension 1280, got {len(embedding)}.")
        except Exception as e:
            print(f"Request failed: {e}")


if __name__ == "__main__":
    test_image_embedding()
test_rss.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import feedparser
2
+
3
# Ad-hoc script: fetch a Google News RSS search feed and print the fields of
# its first entry, if the feed returned any entries.
rss_url = "https://news.google.com/rss/search?q=%EB%A7%88%EC%9D%B4%ED%81%AC%EB%A1%9C%EC%86%8C%ED%94%84%ED%8A%B8&hl=ko&gl=KR&ceid=KR:ko"
feed = feedparser.parse(rss_url)

if feed.entries:
    entry = feed.entries[0]
    print(entry.keys())
    # Attributes are looked up one at a time, in the same order as printed.
    for label, attr in (("Title:", "title"), ("Link:", "link"), ("Description:", "description")):
        print(label, getattr(entry, attr))