Pamudu13 committed on
Commit
19a4a86
·
verified ·
1 Parent(s): 58c842e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -252
app.py CHANGED
@@ -1,265 +1,40 @@
1
- from flask import Flask, jsonify, request
2
  import requests
3
- from bs4 import BeautifulSoup
4
  import os
5
- import re
6
- import urllib.parse
7
- import time
8
- import random
9
- import base64
10
- from io import BytesIO
11
- from urllib.parse import urlparse
12
- import html2text
13
 
14
  app = Flask(__name__)
15
 
16
- def search_images(query, num_images=5):
17
- # Headers to mimic a browser request
18
- headers = {
19
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
20
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
21
- 'Accept-Language': 'en-US,en;q=0.5',
22
- 'Accept-Encoding': 'gzip, deflate',
23
- 'DNT': '1',
24
- 'Connection': 'keep-alive',
25
- }
26
-
27
- # Format the query for URL
28
- formatted_query = urllib.parse.quote(query)
29
-
30
- # Google Images URL
31
- url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"
32
 
 
 
33
  try:
34
- # Get the HTML content
35
- response = requests.get(url, headers=headers, timeout=30)
36
- response.raise_for_status()
37
-
38
- # Find all image URLs using regex
39
- image_urls = re.findall(r'https?://[^"\']*?(?:jpg|jpeg|png|gif)', response.text)
40
-
41
- # Remove duplicates while preserving order
42
- image_urls = list(dict.fromkeys(image_urls))
43
-
44
- # Store results
45
- results = []
46
- downloaded = 0
47
-
48
- for img_url in image_urls:
49
- if downloaded >= num_images:
50
- break
51
-
52
- try:
53
- # Skip small thumbnails and icons
54
- if 'gstatic.com' in img_url or 'google.com' in img_url:
55
- continue
56
-
57
- # Download image
58
- img_response = requests.get(img_url, headers=headers, timeout=10)
59
- img_response.raise_for_status()
60
-
61
- # Check if the response is actually an image
62
- content_type = img_response.headers.get('Content-Type', '')
63
- if not content_type.startswith('image/'):
64
- continue
65
-
66
- # Convert image to base64
67
- image_base64 = base64.b64encode(img_response.content).decode('utf-8')
68
-
69
- # Add to results
70
- results.append({
71
- 'image_url': img_url,
72
- 'base64_data': f"data:{content_type};base64,{image_base64}"
73
- })
74
-
75
- downloaded += 1
76
-
77
- # Add a random delay between downloads
78
- time.sleep(random.uniform(0.5, 1))
79
-
80
- except Exception as e:
81
- print(f"Error downloading image: {str(e)}")
82
- continue
83
-
84
- return results
85
-
86
- except Exception as e:
87
- print(f"An error occurred: {str(e)}")
88
- return []
89
-
90
- @app.route('/search_images', methods=['GET'])
91
- def api_search_images():
92
- try:
93
- # Get query parameters
94
- query = request.args.get('query', '')
95
- num_images = int(request.args.get('num_images', 5))
96
-
97
- if not query:
98
- return jsonify({'error': 'Query parameter is required'}), 400
99
-
100
- if num_images < 1 or num_images > 20:
101
- return jsonify({'error': 'Number of images must be between 1 and 20'}), 400
102
-
103
- # Search for images
104
- results = search_images(query, num_images)
105
-
106
- return jsonify({
107
- 'success': True,
108
- 'query': query,
109
- 'results': results
110
- })
111
-
112
- except Exception as e:
113
- return jsonify({
114
- 'success': False,
115
- 'error': str(e)
116
- }), 500
117
-
118
- def get_domain(url):
119
- """Extract domain from URL"""
120
- parsed_uri = urlparse(url)
121
- return parsed_uri.netloc
122
-
123
- def clean_text(text):
124
- """Clean scraped text"""
125
- # Remove extra whitespace
126
- text = re.sub(r'\s+', ' ', text)
127
- # Remove special characters
128
- text = re.sub(r'[^\w\s.,!?-]', '', text)
129
- return text.strip()
130
-
131
- def scrape_website(url, headers):
132
- """Scrape content from a single website"""
133
- try:
134
- response = requests.get(url, headers=headers, timeout=10)
135
- response.raise_for_status()
136
-
137
- soup = BeautifulSoup(response.text, 'html.parser')
138
-
139
- # Remove unwanted elements
140
- for element in soup(['script', 'style', 'nav', 'footer', 'iframe']):
141
- element.decompose()
142
-
143
- # Convert HTML to text
144
- h = html2text.HTML2Text()
145
- h.ignore_links = True
146
- h.ignore_images = True
147
- text = h.handle(str(soup))
148
-
149
- # Clean the text
150
- text = clean_text(text)
151
-
152
- # Get meta description
153
- meta_desc = ''
154
- meta_tag = soup.find('meta', attrs={'name': 'description'}) or soup.find('meta', attrs={'property': 'og:description'})
155
- if meta_tag:
156
- meta_desc = meta_tag.get('content', '')
157
-
158
- # Get title
159
- title = soup.title.string if soup.title else ''
160
-
161
- return {
162
- 'title': clean_text(title),
163
- 'meta_description': clean_text(meta_desc),
164
- 'content': text[:10000], # Limit content length
165
- 'url': url
166
  }
167
 
168
- except Exception as e:
169
- print(f"Error scraping {url}: {str(e)}")
170
- return None
171
-
172
- def search_and_scrape(query, num_results=5):
173
- headers = {
174
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
175
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
176
- 'Accept-Language': 'en-US,en;q=0.5',
177
- 'Accept-Encoding': 'gzip, deflate',
178
- 'DNT': '1',
179
- 'Connection': 'keep-alive',
180
- }
181
-
182
- # Format the query for URL
183
- formatted_query = urllib.parse.quote(query)
184
-
185
- # Google Search URL
186
- url = f"https://www.google.com/search?q={formatted_query}&num={num_results}"
187
-
188
- try:
189
- # Get Google search results
190
- response = requests.get(url, headers=headers, timeout=30)
191
- response.raise_for_status()
192
-
193
- soup = BeautifulSoup(response.text, 'html.parser')
194
-
195
- # Find all search result divs
196
- search_results = []
197
- result_divs = soup.find_all('div', class_='g')
198
-
199
- for div in result_divs:
200
- # Find the link
201
- link = div.find('a')
202
- if not link:
203
- continue
204
-
205
- href = link.get('href', '')
206
-
207
- # Skip if not a valid URL or if it's a Google-related URL
208
- # if not href.startswith('http') or 'google.' in href:
209
- # continue
210
-
211
- # Add random delay between requests
212
- time.sleep(random.uniform(1, 2))
213
-
214
- # Scrape the website
215
- site_data = scrape_website(href, headers)
216
- if site_data:
217
- search_results.append(site_data)
218
-
219
- if len(search_results) >= num_results:
220
- break
221
-
222
- return search_results
223
 
224
  except Exception as e:
225
- print(f"An error occurred: {str(e)}")
226
- return []
227
-
228
- @app.route('/scrape_sites', methods=['GET'])
229
- def api_scrape_sites():
230
- try:
231
- # Get query parameters
232
- query = request.args.get('query', '')
233
- num_results = int(request.args.get('num_results', 5))
234
-
235
- if not query:
236
- return jsonify({'error': 'Query parameter is required'}), 400
237
-
238
- if num_results < 1 or num_results > 10:
239
- return jsonify({'error': 'Number of results must be between 1 and 10'}), 400
240
 
241
- # Search and scrape sites
242
- results = search_and_scrape(query, num_results)
243
-
244
- return jsonify({
245
- 'success': True,
246
- 'query': query,
247
- 'results': results
248
- })
249
-
250
- except Exception as e:
251
- return jsonify({
252
- 'success': False,
253
- 'error': str(e)
254
- }), 500
255
-
256
  if __name__ == '__main__':
257
- app.run(host='0.0.0.0', port=5000)
258
-
259
-
260
-
261
-
262
-
263
-
264
-
265
-
 
1
+ from flask import Flask, request, jsonify
2
  import requests
 
3
  import os
 
 
 
 
 
 
 
 
4
 
5
app = Flask(__name__)

# Hugging Face Inference API endpoint for the DeepSeek Coder 6.7B base model.
API_URL = "https://api-inference.huggingface.co/models/deepseek-ai/deepseek-coder-6.7b-base"
# Bearer token read from the environment at import time.
# NOTE(review): if HUGGING_FACE_API_KEY is unset this evaluates to
# "Bearer None" and every upstream request will fail with 401 — confirm the
# variable is set in the deployment environment.
HEADERS = {"Authorization": f"Bearer {os.getenv('HUGGING_FACE_API_KEY')}"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
@app.route('/', methods=['GET'])
def generate_text():
    """Generate text from the DeepSeek Coder model via the HF Inference API.

    Query parameters:
        prompt: the text prompt to send to the model
                (defaults to 'What is the capital of France?').

    Returns:
        JSON {'response': <generated text>} on success, or
        JSON {'error': 'An unexpected error occurred.'} with HTTP 500 on
        any failure.
    """
    try:
        # Get the user's prompt from query parameters
        user_prompt = request.args.get('prompt', 'What is the capital of France?')

        # Payload for the Hugging Face text-generation endpoint.
        payload = {
            "inputs": user_prompt,
            "parameters": {
                "max_new_tokens": 500,
                "temperature": 0.7,
                "top_p": 0.95
            }
        }

        # Bug fix: the original call had no timeout, so a stalled upstream
        # request would hang this worker indefinitely.
        response = requests.post(API_URL, headers=HEADERS, json=payload,
                                 timeout=60)
        response.raise_for_status()  # Raise an exception for bad status codes

        # The inference API returns a list of generation dicts on success;
        # take the first entry's text.
        # NOTE(review): on model-loading/error responses the body is a dict,
        # not a list — indexing then raises and is reported as a 500 below.
        generated_text = response.json()[0]['generated_text']

        return jsonify({'response': generated_text})

    except Exception as e:
        # Boundary handler: log the failure, return a generic message so the
        # client never sees internal details.
        print(f"Exception occurred: {e}")
        return jsonify({'error': 'An unexpected error occurred.'}), 500
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
if __name__ == '__main__':
    # NOTE(review): debug=True enables the interactive Werkzeug debugger and
    # auto-reload — development only; disable before any public deployment.
    app.run(debug=True)