Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,6 +10,7 @@ import tempfile
|
|
| 10 |
import base64
|
| 11 |
from datetime import datetime
|
| 12 |
import re
|
|
|
|
| 13 |
|
| 14 |
# 로깅 설정
|
| 15 |
logging.basicConfig(
|
|
@@ -205,15 +206,24 @@ def generate_mock_search_results(query):
|
|
| 205 |
return notice + "\n".join(summary_lines)
|
| 206 |
|
| 207 |
# Google 검색 함수 (SerpAPI 대신 직접 검색)
|
|
|
|
| 208 |
def do_google_search(query, num_results=5):
|
| 209 |
try:
|
| 210 |
-
#
|
| 211 |
headers = {
|
| 212 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
}
|
| 214 |
|
| 215 |
-
# 검색 URL
|
| 216 |
-
search_url = f"https://www.google.com/search?q={query}&num={num_results}"
|
| 217 |
logging.info(f"๊ตฌ๊ธ ๊ฒ์ URL: {search_url}")
|
| 218 |
|
| 219 |
# 요청 보내기 (짧은 타임아웃 설정)
|
|
@@ -224,10 +234,92 @@ def do_google_search(query, num_results=5):
|
|
| 224 |
logging.error(f"Google ๊ฒ์ ์๋ต ์ํ ์ฝ๋: {response.status_code}")
|
| 225 |
return generate_mock_search_results(query)
|
| 226 |
|
| 227 |
-
#
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
except Exception as e:
|
| 233 |
logging.error(f"Google ๊ฒ์ ์คํจ: {e}")
|
|
@@ -613,5 +705,6 @@ if __name__ == "__main__":
|
|
| 613 |
f.write("requests>=2.32.3\n")
|
| 614 |
f.write("markdown>=3.5.1\n")
|
| 615 |
f.write("pillow>=10.1.0\n")
|
|
|
|
| 616 |
|
| 617 |
main()
|
|
|
|
| 10 |
import base64
|
| 11 |
from datetime import datetime
|
| 12 |
import re
|
| 13 |
+
from bs4 import BeautifulSoup # BeautifulSoup ์ถ๊ฐ
|
| 14 |
|
| 15 |
# 로깅 설정
|
| 16 |
logging.basicConfig(
|
|
|
|
| 206 |
return notice + "\n".join(summary_lines)
|
| 207 |
|
| 208 |
# Google 검색 함수 (SerpAPI 대신 직접 검색)
|
| 209 |
+
# Google ๊ฒ์ ํจ์ (BeautifulSoup์ ์ฌ์ฉํ์ฌ ๊ฒฐ๊ณผ ํ์ฑ)
|
| 210 |
def do_google_search(query, num_results=5):
|
| 211 |
try:
|
| 212 |
+
# 다양한 User-Agent 사용 (Google 차단 방지)
|
| 213 |
headers = {
|
| 214 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 215 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 216 |
+
'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
|
| 217 |
+
'Accept-Encoding': 'gzip, deflate, br',
|
| 218 |
+
'Referer': 'https://www.google.com/',
|
| 219 |
+
'DNT': '1',
|
| 220 |
+
'Connection': 'keep-alive',
|
| 221 |
+
'Upgrade-Insecure-Requests': '1',
|
| 222 |
+
'Cache-Control': 'max-age=0',
|
| 223 |
}
|
| 224 |
|
| 225 |
+
# 검색 URL (일부 파라미터 추가)
|
| 226 |
+
search_url = f"https://www.google.com/search?q={query}&num={num_results}&hl=ko&gl=kr"
|
| 227 |
logging.info(f"๊ตฌ๊ธ ๊ฒ์ URL: {search_url}")
|
| 228 |
|
| 229 |
# 요청 보내기 (짧은 타임아웃 설정)
|
|
|
|
| 234 |
logging.error(f"Google ๊ฒ์ ์๋ต ์ํ ์ฝ๋: {response.status_code}")
|
| 235 |
return generate_mock_search_results(query)
|
| 236 |
|
| 237 |
+
# BeautifulSoup์ผ๋ก HTML ํ์ฑ
|
| 238 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
| 239 |
+
|
| 240 |
+
# 검색 결과 추출
|
| 241 |
+
organic_results = []
|
| 242 |
+
|
| 243 |
+
# ๊ฒ์ ๊ฒฐ๊ณผ ์ปจํ
์ด๋ ์ฐพ๊ธฐ (Google์ HTML ๊ตฌ์กฐ์ ๋ฐ๋ผ ๋ณ๊ฒฝ๋ ์ ์์)
|
| 244 |
+
result_containers = soup.select('div.g')
|
| 245 |
+
|
| 246 |
+
if not result_containers:
|
| 247 |
+
logging.warning("Google ๊ฒ์ ๊ฒฐ๊ณผ ์ปจํ
์ด๋๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค. ๋์ฒด ์ ํ์๋ฅผ ์๋ํฉ๋๋ค.")
|
| 248 |
+
# 대체 선택자 시도
|
| 249 |
+
result_containers = soup.select('div[data-hveid]')
|
| 250 |
+
|
| 251 |
+
counter = 0
|
| 252 |
+
for container in result_containers:
|
| 253 |
+
if counter >= num_results:
|
| 254 |
+
break
|
| 255 |
+
|
| 256 |
+
# 제목 추출
|
| 257 |
+
title_element = container.select_one('h3')
|
| 258 |
+
if not title_element:
|
| 259 |
+
continue
|
| 260 |
+
|
| 261 |
+
title = title_element.get_text()
|
| 262 |
+
|
| 263 |
+
# 링크 추출
|
| 264 |
+
link_element = container.select_one('a')
|
| 265 |
+
if not link_element:
|
| 266 |
+
continue
|
| 267 |
+
|
| 268 |
+
link = link_element.get('href', '')
|
| 269 |
+
if link.startswith('/url?'):
|
| 270 |
+
# Google์ ๋ฆฌ๋ค์ด๋ ํธ URL์์ ์ค์ URL ์ถ์ถ
|
| 271 |
+
link = link.split('q=')[1].split('&')[0] if 'q=' in link else link
|
| 272 |
+
elif not link.startswith('http'):
|
| 273 |
+
continue
|
| 274 |
+
|
| 275 |
+
# 스니펫 추출
|
| 276 |
+
snippet_element = container.select_one('div.VwiC3b') or container.select_one('span.aCOpRe')
|
| 277 |
+
snippet = snippet_element.get_text() if snippet_element else "์ค๋ช
์์"
|
| 278 |
+
|
| 279 |
+
# 표시 링크 추출
|
| 280 |
+
displayed_link_element = container.select_one('cite')
|
| 281 |
+
displayed_link = displayed_link_element.get_text() if displayed_link_element else link
|
| 282 |
+
|
| 283 |
+
organic_results.append({
|
| 284 |
+
"title": title,
|
| 285 |
+
"link": link,
|
| 286 |
+
"snippet": snippet,
|
| 287 |
+
"displayed_link": displayed_link
|
| 288 |
+
})
|
| 289 |
+
counter += 1
|
| 290 |
+
|
| 291 |
+
if not organic_results:
|
| 292 |
+
logging.warning("๊ฒ์ ๊ฒฐ๊ณผ๋ฅผ ํ์ฑํ ์ ์์ต๋๋ค. ์ ํ์๊ฐ ๋ณ๊ฒฝ๋์์ ์ ์์ต๋๋ค.")
|
| 293 |
+
return generate_mock_search_results(query)
|
| 294 |
+
|
| 295 |
+
# 검색 결과 마크다운 형식으로 변환
|
| 296 |
+
summary_lines = []
|
| 297 |
+
for idx, item in enumerate(organic_results, start=1):
|
| 298 |
+
title = item.get("title", "No title")
|
| 299 |
+
link = item.get("link", "#")
|
| 300 |
+
snippet = item.get("snippet", "No description")
|
| 301 |
+
displayed_link = item.get("displayed_link", link)
|
| 302 |
+
|
| 303 |
+
summary_lines.append(
|
| 304 |
+
f"### Result {idx}: {title}\n\n"
|
| 305 |
+
f"{snippet}\n\n"
|
| 306 |
+
f"**์ถ์ฒ**: [{displayed_link}]({link})\n\n"
|
| 307 |
+
f"---\n"
|
| 308 |
+
)
|
| 309 |
+
|
| 310 |
+
# 모델에게 명확한 지침 추가
|
| 311 |
+
instructions = """
|
| 312 |
+
# ์น ๊ฒ์ ๊ฒฐ๊ณผ
|
| 313 |
+
์๋๋ ๊ฒ์ ๊ฒฐ๊ณผ์
๋๋ค. ์ง๋ฌธ์ ๋ต๋ณํ ๋ ์ด ์ ๋ณด๋ฅผ ํ์ฉํ์ธ์:
|
| 314 |
+
1. ๊ฐ ๊ฒฐ๊ณผ์ ์ ๋ชฉ, ๋ด์ฉ, ์ถ์ฒ ๋งํฌ๋ฅผ ์ฐธ๊ณ ํ์ธ์
|
| 315 |
+
2. ๋ต๋ณ์ ๊ด๋ จ ์ ๋ณด์ ์ถ์ฒ๋ฅผ ๋ช
์์ ์ผ๋ก ์ธ์ฉํ์ธ์ (์: "X ์ถ์ฒ์ ๋ฐ๋ฅด๋ฉด...")
|
| 316 |
+
3. ์๋ต์ ์ค์ ์ถ์ฒ ๋งํฌ๋ฅผ ํฌํจํ์ธ์
|
| 317 |
+
4. ์ฌ๋ฌ ์ถ์ฒ์ ์ ๋ณด๋ฅผ ์ข
ํฉํ์ฌ ๋ต๋ณํ์ธ์
|
| 318 |
+
"""
|
| 319 |
+
|
| 320 |
+
search_results = instructions + "\n".join(summary_lines)
|
| 321 |
+
logging.info(f"Google ๊ฒ์ ๊ฒฐ๊ณผ {len(organic_results)}๊ฐ ํ์ฑ ์๋ฃ")
|
| 322 |
+
return search_results
|
| 323 |
|
| 324 |
except Exception as e:
|
| 325 |
logging.error(f"Google ๊ฒ์ ์คํจ: {e}")
|
|
|
|
| 705 |
f.write("requests>=2.32.3\n")
|
| 706 |
f.write("markdown>=3.5.1\n")
|
| 707 |
f.write("pillow>=10.1.0\n")
|
| 708 |
+
f.write("beautifulsoup4>=4.12.0\n") # BeautifulSoup ์ถ๊ฐ
|
| 709 |
|
| 710 |
main()
|