Chris4K committed on
Commit
c0d93a3
·
verified ·
1 Parent(s): 1360b7c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +307 -0
app.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import pytz
5
+ from datetime import datetime, timedelta
6
+ import logging
7
+ import traceback
8
+ from typing import List, Dict, Any
9
+ import hashlib
10
+ import icalendar
11
+ import uuid
12
+ import re
13
+ import json
14
+
15
+ # Hugging Face imports
16
+ from transformers import AutoModelForCausalLM, AutoTokenizer
17
+ import torch
18
+
19
class EventScraper:
    """Scrape event listings from web pages and convert them to iCal.

    Pipeline per URL: fetch HTML -> strip to plain text -> prompt an LLM to
    emit a JSON list of events -> parse, deduplicate, and collect the events
    both as plain dicts and as VEVENT components of an icalendar.Calendar.
    """

    def __init__(self, urls, timezone='Europe/Berlin'):
        """Create a scraper for one URL or a list of URLs.

        Args:
            urls: A single URL string or a list of URL strings.
            timezone: IANA timezone name used to localize timed events.
        """
        # Setup logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Timezone used when localizing parsed event datetimes.
        self.timezone = pytz.timezone(timezone)

        # Normalize to a list so scrape_events can always iterate.
        self.urls = urls if isinstance(urls, list) else [urls]

        # Content hashes of already-seen events, to prevent duplicates.
        self.event_cache = set()

        # iCal calendar accumulating one VEVENT per unique event.
        self.calendar = icalendar.Calendar()
        self.calendar.add('prodid', '-//Event Scraper//example.com//')
        self.calendar.add('version', '2.0')

        # Model and tokenizer are loaded lazily on first use (see setup_llm).
        self.model = None
        self.tokenizer = None

    def setup_llm(self):
        """Load the Hugging Face model/tokenizer once; no-op if already loaded.

        Raises:
            Exception: re-raises any load failure after surfacing a Gradio warning.
        """
        if self.model is not None and self.tokenizer is not None:
            return

        try:
            model_name = "meta-llama/Llama-3.2-3B-Instruct"
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            # NOTE: return_dict_in_generate is a generation-time option, not a
            # model-loading one, so it is intentionally not passed here (its
            # default is False anyway).
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map='auto'
            )
        except Exception as e:
            gr.Warning(f"LLM Setup Error: {str(e)}")
            raise

    def fetch_webpage_content(self, url):
        """Fetch a URL and return its HTML text, or "" on any request error."""
        try:
            # Browser-like User-Agent: some event sites block default clients.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.text
        except Exception as e:
            gr.Warning(f"Error fetching {url}: {str(e)}")
            return ""

    def extract_text_from_html(self, html_content):
        """Strip markup and boilerplate from HTML; return at most 2000 words."""
        soup = BeautifulSoup(html_content, 'html.parser')

        # Drop non-content elements before extracting visible text.
        for script in soup(["script", "style", "nav", "header", "footer"]):
            script.decompose()

        text = soup.get_text(separator=' ', strip=True)
        # Cap at 2000 whitespace-separated tokens to bound the LLM prompt size.
        return ' '.join(text.split()[:2000])

    def generate_event_extraction_prompt(self, text):
        """Create the instruction prompt asking the LLM for a JSON event list."""
        prompt = f"""You are an event extraction assistant.
        Find and extract all events from the following text.
        For each event, provide:
        - Exact event name
        - Date (DD.MM.YYYY)
        - Time (HH:MM if available)
        - Location
        - Short description

        Important: Extract ALL possible events.
        Text to analyze:
        {text}

        Output ONLY a JSON list of events like this - Response Format:
        [
        {{
        "name": "Event Name",
        "date": "07.12.2024",
        "time": "19:00",
        "location": "Event Location",
        "description": "Event details"
        }}
        ]

        If NO events are found, return an empty list [].
        Only return the json. nothing else. no comments."""
        return prompt

    def parse_llm_response(self, response):
        """Parse the LLM's text response into a list of event dicts.

        Returns [] when no JSON can be recovered; a bare JSON object is
        wrapped into a one-element list.
        """
        try:
            # Clean the response
            response = response.strip()

            # Try parsing as JSON
            try:
                events = json.loads(response)

                # Ensure it's a list
                if not isinstance(events, list):
                    events = [events]

                return events
            except json.JSONDecodeError:
                # If direct JSON parsing fails, try extracting a bracketed
                # JSON array embedded in surrounding chatter.
                json_match = re.search(r'\[.*\]', response, re.DOTALL)
                if json_match:
                    try:
                        events = json.loads(json_match.group(0))
                        return events
                    except json.JSONDecodeError:
                        pass

                gr.Warning(f"Failed to parse response: {response}")
                return []
        except Exception as e:
            gr.Warning(f"Parsing error: {str(e)}")
            return []

    def scrape_events(self):
        """Scrape all configured URLs and return the deduplicated event dicts.

        Side effects: populates self.event_cache and appends VEVENTs to
        self.calendar. Per-URL failures are reported via gr.Warning and do
        not abort the remaining URLs.
        """
        # Ensure LLM is set up (lazy, one-time load).
        self.setup_llm()

        all_events = []

        for url in self.urls:
            try:
                # Fetch webpage and reduce it to plain text.
                html_content = self.fetch_webpage_content(url)
                text_content = self.extract_text_from_html(html_content)

                # Build the extraction prompt and run the model.
                prompt = self.generate_event_extraction_prompt(text_content)
                inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
                outputs = self.model.generate(
                    **inputs,  # pass attention_mask along with input_ids
                    max_new_tokens=12000,
                    do_sample=True,
                    temperature=0.9
                )

                # Decode ONLY the newly generated tokens. Decoding the full
                # sequence would echo the prompt, whose embedded JSON example
                # could be mistaken for a result by parse_llm_response.
                generated_tokens = outputs[0][inputs.input_ids.shape[-1]:]
                response = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)

                # Parse events out of the model response.
                parsed_events = self.parse_llm_response(response)

                # Deduplicate via a stable content hash and collect.
                for event in parsed_events:
                    event_hash = hashlib.md5(
                        json.dumps(event, sort_keys=True, default=str).encode()
                    ).hexdigest()
                    if event_hash not in self.event_cache:
                        self.event_cache.add(event_hash)
                        all_events.append(event)

                        # Mirror the event into the iCal calendar.
                        try:
                            ical_event = self.create_ical_event(event)
                            self.calendar.add_component(ical_event)
                        except Exception as ical_error:
                            gr.Warning(f"iCal creation error: {str(ical_error)}")

            except Exception as e:
                gr.Warning(f"Error processing {url}: {str(e)}")

        return all_events

    def create_ical_event(self, event):
        """Convert one event dict into an icalendar.Event.

        Timed events become a timezone-aware one-hour slot; date-only events
        become all-day events. An unparseable date leaves the VEVENT without
        dtstart/dtend (a warning is emitted instead of raising).
        """
        ical_event = icalendar.Event()

        # Set unique identifier
        ical_event.add('uid', str(uuid.uuid4()))

        # Add summary (name)
        ical_event.add('summary', event.get('name', 'Unnamed Event'))

        # Add description
        ical_event.add('description', event.get('description', ''))

        # Add location
        if event.get('location'):
            ical_event.add('location', event['location'])

        # Handle date and time
        try:
            if event.get('date'):
                try:
                    event_date = datetime.strptime(event['date'], '%d.%m.%Y').date()

                    # Parse time if available
                    event_time = None
                    if event.get('time'):
                        event_time = datetime.strptime(event['time'], '%H:%M').time()

                    if event_time:
                        # Timed event: localize to the scraper's timezone
                        # (was previously naive) and default to 1-hour length.
                        event_datetime = self.timezone.localize(
                            datetime.combine(event_date, event_time)
                        )
                        ical_event.add('dtstart', event_datetime)
                        ical_event.add('dtend', event_datetime + timedelta(hours=1))
                    else:
                        # Full day event
                        ical_event.add('dtstart', event_date)
                        ical_event.add('dtend', event_date + timedelta(days=1))
                        ical_event.add('x-microsoft-cdo-alldayevent', 'TRUE')
                except ValueError as date_err:
                    gr.Warning(f"Date parsing error: {date_err}")
        except Exception as e:
            gr.Warning(f"iCal event creation error: {str(e)}")

        return ical_event

    def get_ical_string(self):
        """Return the accumulated calendar serialized as an iCal string."""
        return self.calendar.to_ical().decode('utf-8')
245
+
246
def scrape_events_with_urls(urls):
    """Gradio callback: scrape events from user-supplied URLs.

    Args:
        urls: Raw textbox content; URLs separated by commas and/or newlines.

    Returns:
        (events_json, ical_text): two strings for the gr.Code outputs. On
        empty input or error, the empty-state strings ("[]", "") — the
        original returned a Python list here, inconsistent with the string
        returned on success.
    """
    # Split URLs by newline or comma, dropping blank entries.
    url_list = [url.strip() for url in re.split(r'[\n,]+', urls) if url.strip()]

    if not url_list:
        gr.Warning("Please provide at least one valid URL.")
        # Return strings (not a list) so both gr.Code outputs render.
        return "[]", ""

    try:
        # Initialize scraper
        scraper = EventScraper(url_list)

        # Scrape events
        events = scraper.scrape_events()

        # Prepare events output
        events_str = json.dumps(events, indent=2)

        # Get iCal string
        ical_string = scraper.get_ical_string()

        return events_str, ical_string

    except Exception as e:
        gr.Warning(f"Error in event scraping: {str(e)}")
        return "[]", ""
273
+
274
+ # Create Gradio Interface
275
def create_gradio_app():
    """Assemble and return the Gradio Blocks UI for the event scraper."""
    with gr.Blocks() as demo:
        # Page header.
        gr.Markdown("# Event Scraper 🗓️")
        gr.Markdown("Scrape events from web pages using an AI-powered event extraction tool.")

        # Input row: URL textbox plus the trigger button.
        with gr.Row():
            with gr.Column():
                urls_box = gr.Textbox(
                    label="Enter URLs (comma or newline separated)",
                    placeholder="https://example.com/events\nhttps://another-site.com/calendar"
                )
                run_button = gr.Button("Scrape Events", variant="primary")

        # Output row: extracted JSON on the left, iCal text on the right.
        with gr.Row():
            with gr.Column():
                events_view = gr.Code(label="Extracted Events (JSON)", language="json")
            with gr.Column():
                ical_view = gr.Code(label="iCal Export", language="")

        # Wire the button to the scraping callback.
        run_button.click(
            fn=scrape_events_with_urls,
            inputs=urls_box,
            outputs=[events_view, ical_view]
        )

        gr.Markdown("**Note:** Requires an internet connection and may take a few minutes to process.")

    return demo
303
+
304
+ # Launch the app
305
if __name__ == "__main__":
    # Build the UI and start the local Gradio server.
    create_gradio_app().launch()