Robin Chiu committed on
Commit
6185b4f
·
1 Parent(s): b891a5d

init version

Browse files
Files changed (4) hide show
  1. app.py +131 -0
  2. pyproject.toml +10 -0
  3. requirements.txt +9 -0
  4. uv.lock +0 -0
app.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ from bs4 import BeautifulSoup
3
+
4
def parse_news_item(html: str) -> dict:
    """Extract link, time, headline and text from one news-item HTML snippet.

    Args:
        html: HTML markup for a single search-result list item.

    Returns:
        A dict with keys 'link', 'time', 'headline' and 'text'; each value
        is the extracted string, or None when the element is missing.
    """
    soup = BeautifulSoup(html, "html.parser")

    def _stripped_text(tag):
        # Shared helper: stripped text of a tag, or None when the tag is absent.
        return tag.get_text(strip=True) if tag else None

    # First anchor that actually carries an href holds the article link.
    anchor = soup.find("a", href=True)

    return {
        "link": anchor["href"] if anchor else None,
        "time": _stripped_text(soup.find("time")),
        "headline": _stripped_text(soup.find("h3", class_="story__headline")),
        "text": _stripped_text(soup.find("p", class_="story__text")),
    }
29
+
30
+
31
+ # %%
32
+ import requests
33
+ from bs4 import BeautifulSoup
34
+
35
def search_news(keyword, page=1, timeout=10):
    """
    Fetch news articles related to a keyword from money.udn.com.

    Args:
        keyword: The search keyword for news articles.
        page: The page number to fetch (default is 1).
        timeout: Seconds to wait for the HTTP response (default is 10).
            Prevents the request from hanging the UI indefinitely.

    Returns:
        A list of [link, time, headline, text] rows, one per article,
        matching the gr.DataFrame headers. Empty list on any failure.
    """
    url = f"https://money.udn.com/search/result/1001/{keyword}/{page}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network failure / timeout: keep the best-effort contract and
        # return an empty result instead of crashing the caller.
        print(f"Failed to retrieve data: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve data: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): selector assumes udn's current result-page layout — verify
    # against the live markup if results come back empty.
    articles = soup.select('div > div > main > section > ul > li')

    results = []
    for article in articles:
        # parse_news_item expects an HTML string; str() serializes the tag
        # without the extra whitespace prettify() would inject.
        data = parse_news_item(str(article))
        # The gr.DataFrame consumer wants plain rows, not dicts.
        results.append(list(data.values()))

    return results
65
+
66
+ # search_news('台積電', 1) # Example usage to fetch news articles related to '台積電'
67
+
68
+ # %%
69
+ # write a function to get the url and parse the content
70
# write a function to get the url and parse the content
def get_content(url, timeout=10):
    """
    Fetch and parse the content of a given URL.

    Args:
        url: The URL to fetch and parse.
        timeout: Seconds to wait for the HTTP response (default is 10).

    Returns:
        A dictionary with 'link', 'title' and 'text' keys, or None when
        the page could not be retrieved.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported, not raised, so the
        # UI caller degrades gracefully.
        print(f"Failed to retrieve {url}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve {url}: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # The article text is expected inside the element with id="article_body"
    # on money.udn.com story pages.
    article_body = soup.select_one('#article_body')
    text_content = ''
    if article_body:
        text_content = article_body.get_text(separator='\n', strip=True)

    # soup.title.string is None when <title> has nested tags, which would
    # leak None past the 'No title' fallback; get_text() is robust.
    title = soup.title.get_text(strip=True) if soup.title else 'No title'

    return {
        'link': url,
        'title': title,
        'text': text_content
    }
100
+
101
+
102
+ # get_content('https://money.udn.com/money/story/5612/8832289?from=edn_search_result') # Example usage to fetch content from a specific URL
103
+
104
# %%
# Gradio UI with two tabs:
#   1. search news by keyword
#   2. fetch content from a URL
import gradio as gr


def main():
    """Build the two-tab Gradio app and launch it (with MCP server enabled)."""
    with gr.Blocks() as app:
        gr.Markdown("# News Search and Content Fetcher")

        with gr.Tab("Search News"):
            keyword_box = gr.Textbox(label="Keyword", placeholder="Enter keyword to search news")
            page_number = gr.Number(label="Page Number", value=1, step=1)
            search_btn = gr.Button("Search")
            results_table = gr.DataFrame(label="Search Results", headers=["Link", "Time", "Headline", "Text"])

            search_btn.click(search_news, inputs=[keyword_box, page_number], outputs=results_table)

        with gr.Tab("Get Content from URL"):
            url_box = gr.Textbox(label="URL", placeholder="Enter URL to fetch content")
            content_json = gr.JSON(label="Content Output")

            # Pressing Enter in the URL box triggers the fetch.
            url_box.submit(get_content, inputs=url_box, outputs=content_json)

        app.launch(mcp_server=True, server_name="0.0.0.0", allowed_paths=["/"])


if __name__ == "__main__":
    main()
130
+
131
+
pyproject.toml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "newsagent"
3
+ version = "0.1.0"
4
+ description = "Search money.udn.com news and fetch article content via a Gradio UI"
5
+ readme = "README.md"
6
+ requires-python = ">=3.13"
7
+ dependencies = [
+     "bs4>=0.0.2",
+     "gradio[mcp]>=5.33.1",
+     "requests>=2.31",
+ ]
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ duckduckgo_search
2
+ smolagents
3
+ gradio[mcp]
4
+ datasets
5
+ langchain
6
+ langchain-chroma
7
+ langchain-text-splitters
8
+ langchain-community
9
+ sentence-transformers
uv.lock ADDED
The diff for this file is too large to render. See raw diff