nakas committed on
Commit
c218a7a
·
verified ·
1 Parent(s): 770c9e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -72
app.py CHANGED
@@ -1,21 +1,21 @@
1
  import gradio as gr
2
  from playwright.sync_api import sync_playwright
 
3
  import time
4
  import json
5
- from download_browsers import download_playwright_browsers
6
 
7
- # Download browsers on startup
8
- download_playwright_browsers()
9
-
10
- def scrape_website(url, wait_time=5):
11
  """
12
- Scrape a website using Playwright headless browser
13
  Args:
14
- url (str): The URL to scrape
15
- wait_time (int): Time to wait for dynamic content to load
16
  Returns:
17
- dict: Dictionary containing scraped data
18
  """
 
 
19
  try:
20
  with sync_playwright() as p:
21
  # Launch browser in headless mode
@@ -25,87 +25,128 @@ def scrape_website(url, wait_time=5):
25
  )
26
  page = context.new_page()
27
 
28
- # Go to URL and wait for network to be idle
29
  page.goto(url, wait_until="networkidle")
30
- time.sleep(wait_time) # Additional wait for dynamic content
31
 
32
- # Get basic page information
33
- title = page.title()
 
34
 
35
- # Extract all text content
36
- text_content = page.text_content('body')
 
 
 
 
 
 
 
 
 
 
37
 
38
- # Extract all links
39
- links = page.eval_on_selector_all('a[href]', 'elements => elements.map(el => el.href)')
40
 
41
- # Extract all images
42
- images = page.eval_on_selector_all('img[src]', 'elements => elements.map(el => el.src)')
 
43
 
44
- # Get meta description
45
- meta_description = page.eval_on_selector('meta[name="description"]',
46
- 'element => element.content') if page.query_selector('meta[name="description"]') else ''
 
 
47
 
48
- # Close browser
49
- browser.close()
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  return {
52
- "title": title,
53
- "meta_description": meta_description,
54
- "text_content": text_content[:1000] + "...", # Truncate for display
55
- "links": links[:10], # Show first 10 links
56
- "images": images[:5], # Show first 5 images
57
- "status": "Success"
58
  }
59
 
60
  except Exception as e:
61
  return {
62
- "status": "Error",
63
- "error_message": str(e)
64
  }
65
 
66
  def format_output(result):
67
- """Format the output for better display in Gradio"""
68
- if result["status"] == "Error":
69
- return f"Error: {result['error_message']}"
70
 
71
- output = f"""
72
- ### Page Title
73
- {result['title']}
74
-
75
- ### Meta Description
76
- {result['meta_description']}
77
-
78
- ### First 1000 characters of content
79
- {result['text_content']}
80
-
81
- ### First 10 Links
82
- {json.dumps(result['links'], indent=2)}
83
-
84
- ### First 5 Images
85
- {json.dumps(result['images'], indent=2)}
86
- """
87
- return output
 
 
 
 
 
 
 
 
88
 
89
  # Create Gradio interface
90
- iface = gr.Interface(
91
- fn=lambda url, wait_time: format_output(scrape_website(url, wait_time)),
92
- inputs=[
93
- gr.Textbox(label="URL to scrape", placeholder="https://example.com"),
94
- gr.Slider(minimum=1, maximum=15, value=5, step=1, label="Wait time (seconds)")
95
- ],
96
- outputs=gr.Markdown(),
97
- title="Web Scraper with Headless Browser",
98
- description="""
99
- Enter a URL to scrape its content using a headless browser.
100
- The tool will extract the title, meta description, text content, links, and images.
101
- Please use responsibly and respect websites' terms of service and robots.txt files.
102
- """,
103
- examples=[
104
- ["https://example.com", 5],
105
- ["https://news.ycombinator.com", 8]
106
- ]
107
- )
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
- # Launch the interface
110
  if __name__ == "__main__":
111
- iface.launch()
 
1
  import gradio as gr
2
  from playwright.sync_api import sync_playwright
3
+ import pandas as pd
4
  import time
5
  import json
6
+ from datetime import datetime
7
 
8
def scrape_weather_data(site_id="YCTIM", hours=720):
    """
    Scrape observation data from a weather.gov timeseries page.

    Args:
        site_id (str): The weather station ID (e.g. "YCTIM").
        hours (int): Number of hours of history to request.

    Returns:
        dict: On success: {'status': 'Success', 'statistics': {...},
              'data': [row dicts]}. On failure: {'status': 'Error',
              'error_message': str}.
    """
    url = (
        f"https://www.weather.gov/wrh/timeseries?site={site_id}&hours={hours}"
        "&units=english&chart=on&headers=on&obs=tabular&hourly=false"
        "&pview=full&font=12&plot="
    )

    try:
        with sync_playwright() as p:
            # Launch browser in headless mode.
            # NOTE(review): the launch/new_context arguments were elided in the
            # diff hunk this block was recovered from — confirm against the
            # original file (a custom user_agent was likely passed here).
            browser = p.chromium.launch(headless=True)
            context = browser.new_context()
            page = context.new_page()

            # Navigate and let network activity settle, then give the
            # client-side rendering a fixed grace period to fill the table.
            page.goto(url, wait_until="networkidle")
            time.sleep(5)

            # wait_for_selector raises TimeoutError when the table never
            # appears, so no extra existence check is needed afterwards.
            page.wait_for_selector('#obsTable', timeout=10000)

            # Pull headers and cell text out of the table in one round trip.
            data = page.evaluate('''() => {
                const table = document.querySelector('#obsTable');
                const headers = Array.from(table.querySelectorAll('thead th')).map(th => th.textContent.trim());
                const rows = Array.from(table.querySelectorAll('tbody tr')).map(row => {
                    return Array.from(row.querySelectorAll('td')).map(td => td.textContent.trim());
                });
                return {headers, rows};
            }''')

            # Close browser
            browser.close()

        # Headers can contain embedded newlines; normalize to single-line.
        headers = [h.replace('\n', ' ').strip() for h in data['headers']]
        df = pd.DataFrame(data['rows'], columns=headers)

        # Convert numeric columns (blank / non-numeric cells become NaN).
        numeric_columns = ['Temp. (°F)', 'Dew Point (°F)', 'Relative Humidity (%)',
                           'Wind Chill (°F)', 'Snow Depth (in)']
        for col in numeric_columns:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        # Parse wind speed and optional gust. The gust suffix ("15G25") is
        # not always present; the original regex r'(\d+)G(\d+)' required it
        # and therefore discarded the speed of every gust-free observation.
        if 'Wind Speed (mph)' in df.columns:
            wind = df['Wind Speed (mph)'].str.extract(r'(?P<speed>\d+)(?:G(?P<gust>\d+))?')
            df['Wind Speed'] = pd.to_numeric(wind['speed'], errors='coerce')
            df['Wind Gust'] = pd.to_numeric(wind['gust'], errors='coerce')

        # Build summary statistics only from columns that actually exist, so
        # a station lacking e.g. snow depth does not raise KeyError and turn
        # the whole scrape into an Error result.
        stats = {}
        if 'Temp. (°F)' in df.columns:
            stats['Temperature Range'] = (
                f"{df['Temp. (°F)'].min():.1f}°F to {df['Temp. (°F)'].max():.1f}°F"
            )
            stats['Average Temperature'] = f"{df['Temp. (°F)'].mean():.1f}°F"
        if 'Wind Speed' in df.columns:
            stats['Max Wind Speed'] = f"{df['Wind Speed'].max():.1f} mph"
        if 'Wind Gust' in df.columns:
            stats['Max Wind Gust'] = f"{df['Wind Gust'].max():.1f} mph"
        if 'Relative Humidity (%)' in df.columns:
            stats['Average Humidity'] = f"{df['Relative Humidity (%)'].mean():.1f}%"
        if 'Snow Depth (in)' in df.columns:
            stats['Max Snow Depth'] = f"{df['Snow Depth (in)'].max():.1f} inches"

        return {
            'status': 'Success',
            'statistics': stats,
            'data': df.to_dict('records')
        }

    except Exception as e:
        # Surface any scrape/parse failure to the UI rather than crashing.
        return {
            'status': 'Error',
            'error_message': str(e)
        }
87
 
88
def format_output(result):
    """
    Format scrape results for the three Gradio outputs.

    Args:
        result (dict): Return value of scrape_weather_data.

    Returns:
        tuple: (stats_html, temp_fig, wind_fig). The figures are matplotlib
        Figure objects suitable for gr.Plot, or None when the corresponding
        columns are unavailable (or on error).
    """
    if result['status'] == 'Error':
        return f"Error: {result['error_message']}", None, None

    # Render the statistics dict as a simple HTML list.
    stats_parts = ["<div style='font-size: 16px; line-height: 1.5;'>"]
    for key, value in result['statistics'].items():
        stats_parts.append(f"<p><strong>{key}:</strong> {value}</p>")
    stats_parts.append("</div>")
    stats_html = "".join(stats_parts)

    # Rebuild a DataFrame for plotting.
    df = pd.DataFrame(result['data'])
    if 'Date/Time' not in df.columns:
        # Without a time axis no plot can be drawn; still show the stats.
        return stats_html, None, None
    df['Date/Time'] = pd.to_datetime(df['Date/Time'])

    # BUG FIX: the original created bare gr.Plot() components and called
    # .pyplot() on them — gr.Plot has no such method and the returned
    # components carried no figure, so nothing was ever plotted. gr.Plot
    # outputs accept a matplotlib Figure directly, so return the figures.
    def _line_figure(columns, title):
        # Plot only the columns that actually exist in the scraped table.
        cols = [c for c in columns if c in df.columns]
        if not cols:
            return None
        ax = df.plot(x='Date/Time', y=cols, title=title, figsize=(12, 6))
        return ax.get_figure()

    temp_fig = _line_figure(['Temp. (°F)', 'Wind Chill (°F)'],
                            'Temperature and Wind Chill Over Time')
    wind_fig = _line_figure(['Wind Speed', 'Wind Gust'],
                            'Wind Speed and Gusts Over Time')

    return stats_html, temp_fig, wind_fig
118
 
119
# Create Gradio interface
with gr.Blocks(title="Weather Station Data Analyzer") as demo:
    gr.Markdown("# Weather Station Data Analyzer")

    with gr.Row():
        site_id = gr.Textbox(
            label="Weather Station ID",
            value="YCTIM",
            placeholder="Enter station ID (e.g., YCTIM)"
        )
        hours = gr.Number(
            label="Hours of Data",
            value=720,
            minimum=1,
            maximum=1440,
            precision=0  # whole hours only; gr.Number yields floats otherwise
        )

    analyze_btn = gr.Button("Fetch and Analyze Weather Data")

    with gr.Row():
        stats_output = gr.HTML(label="Statistics")

    with gr.Row():
        temp_plot = gr.Plot(label="Temperature Plot")
        wind_plot = gr.Plot(label="Wind Plot")

    # BUG FIX: gr.Number passes a float, which produced "hours=720.0" in the
    # scrape URL's query string; cast to int before building the request.
    analyze_btn.click(
        fn=lambda sid, hrs: format_output(scrape_weather_data(sid, int(hrs))),
        inputs=[site_id, hours],
        outputs=[stats_output, temp_plot, wind_plot]
    )


if __name__ == "__main__":
    demo.launch()