IotaCluster committed on
Commit
bf53a82
·
verified ·
1 Parent(s): 1c8c582

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -75
app.py CHANGED
@@ -1,80 +1,37 @@
1
- import re
2
  import gradio as gr
3
- import requests
4
- from docx import Document
5
- from PIL import Image
6
- from io import BytesIO
7
-
8
- SITESHOT_API_KEY = "your_siteshot_api_key_here" # <-- Replace with your actual API key
9
-
10
- def extract_first_table_as_dict(doc_path):
11
- doc = Document(doc_path)
12
- if not doc.tables:
13
- return []
14
-
15
- table = doc.tables[0]
16
- keys = [cell.text.strip() for cell in table.rows[0].cells]
17
- data = []
18
-
19
- for row in table.rows[1:]:
20
- values = [cell.text.strip() for cell in row.cells]
21
- row_dict = dict(zip(keys, values))
22
- data.append(row_dict)
23
-
24
- return data
25
-
26
- def extract_urls_from_dict_list(dict_list):
27
- url_pattern = r'https?://[^\s)>\]]+'
28
- urls = set()
29
- for entry in dict_list:
30
- for value in entry.values():
31
- found = re.findall(url_pattern, value)
32
- urls.update(found)
33
- return list(urls)
34
-
35
- def capture_screenshot(doc_file):
36
- # Step 1: Extract table
37
- table_data = extract_first_table_as_dict(doc_file.name)
38
-
39
- # Step 2: Extract URLs
40
- urls = extract_urls_from_dict_list(table_data)
41
-
42
- # Step 3: Take screenshots
43
- screenshots = []
44
- for url in urls:
45
- try:
46
- response = requests.get(
47
- "https://api.screenshotapi.net/screenshot",
48
- params={
49
- "token": SITESHOT_API_KEY,
50
- "url": url,
51
- "output": "image",
52
- "file_type": "png",
53
- "full_page": "true"
54
- },
55
- stream=True
56
- )
57
- if response.status_code == 200:
58
- image = Image.open(BytesIO(response.content))
59
- screenshots.append(image)
60
- else:
61
- print(f"Failed screenshot for {url}: {response.status_code}")
62
- except Exception as e:
63
- print(f"Error processing {url}: {e}")
64
-
65
- return table_data, screenshots
66
-
67
- # Gradio Interface
68
- app = gr.Interface(
69
  fn=capture_screenshot,
70
- inputs=gr.File(label="Upload Word (.docx) File", file_types=[".docx"]),
71
- outputs=[
72
- gr.JSON(label="Extracted Table Data"),
73
- gr.Gallery(label="Webpage Screenshots").style(grid=[2], height="auto")
74
- ],
75
- title="📄 DOCX Table Extractor + 🌐 URL Screenshotter",
76
- description="Upload a Word file. This app extracts the first table, finds any links, and screenshots them using SiteShot."
77
  )
78
 
79
  if __name__ == "__main__":
80
- app.launch(share=True)
 
 
1
  import gradio as gr
2
+ from selenium import webdriver
3
+ from selenium.webdriver.chrome.options import Options
4
+ from webdriver_manager.chrome import ChromeDriverManager
5
+ import time
6
+
7
def capture_screenshot(url):
    """Render *url* in headless Chrome and return the path of a PNG screenshot.

    Args:
        url: Address of the page to capture. Surrounding whitespace is ignored.

    Returns:
        Filesystem path of the saved PNG, suitable for a ``gr.Image`` output.

    Raises:
        gr.Error: If the URL is empty, or the page cannot be loaded or the
            screenshot cannot be written.
    """
    # Function-scope imports keep this block self-contained; selenium is
    # already a dependency of this file and tempfile is stdlib.
    import tempfile

    from selenium.webdriver.chrome.service import Service

    url = (url or "").strip()
    if not url:
        # Fail fast instead of launching a browser just to hit an error.
        raise gr.Error("Error: no URL provided")

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1920,1080")

    # Selenium 4 takes the driver binary via a Service object; passing the
    # path as the first positional argument is no longer supported.
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

    try:
        # A unique file per call so concurrent requests do not overwrite
        # each other's screenshot.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            screenshot_path = tmp.name

        driver.get(url)
        time.sleep(2)  # crude settle time for content that loads late
        saved = driver.save_screenshot(screenshot_path)
    except Exception as e:
        # Returning an error string would be misread by gr.Image as an image
        # path; gr.Error surfaces the message in the UI instead.
        raise gr.Error(f"Error: {e}") from e
    finally:
        driver.quit()

    if not saved:
        raise gr.Error("Error: screenshot could not be written")

    return screenshot_path
27
+
28
+ demo = gr.Interface(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  fn=capture_screenshot,
30
+ inputs=gr.Textbox(label="Enter URL"),
31
+ outputs=gr.Image(label="Screenshot"),
32
+ title="Web Page Screenshot Tool",
33
+ description="Enter a URL and get a screenshot using headless Chrome.",
 
 
 
34
  )
35
 
36
  if __name__ == "__main__":
37
+ demo.launch()