Update app.py

app.py CHANGED
@@ -129,7 +129,7 @@ def scrap_portal(queri):
     filter_link2 = [url for url in filter_link1 if "help" not in url]
     return filter_link2
 
-def clean_scrap(artikel,link,models,api_key,azure_api_base,keyword):
+def clean_scrap(artikel,models,api_key,azure_api_base,keyword):
     new_artikel = []
     article = []
     if len(artikel) > 1:
@@ -201,7 +201,7 @@ def clean_scrap(artikel,link,models,api_key,azure_api_base,keyword):
     contents = content[1:]
     contents = [' '.join(contents).replace("article:", '').replace("Article:", '').strip()]
 
-    return title, judul,
+    return title, judul, contents
 
 def scrap_artikel(source_type,source,models,api_key,azure_api_base,keyword):
     options = webdriver.ChromeOptions()
@@ -216,6 +216,7 @@ def scrap_artikel(source_type,source,models,api_key,azure_api_base,keyword):
 
     if source_type == "keyword":
         artikel =[]
+        URL = None
         link = scrap_portal(source)
         for url in link:
            if cek_url(url):
@@ -236,16 +237,17 @@ def scrap_artikel(source_type,source,models,api_key,azure_api_base,keyword):
             for paragraph in containers:
                 artic=paragraph.get_text()
                 artikel.append(artic)
+            URL = URL + url
 
-
-
-
-
-
-
-
+        paragraf = ' '.join(artikel)
+        if len(paragraf)>= 18000:
+            part1, part2, part3, part4 = split_article(paragraf)
+            artikels = [part1, part2, part3, part4]
+        else :
+            artikels = [paragraf]
+        title, judul, contents = clean_scrap(artikels,models,api_key,azure_api_base,keyword)
 
-
+        return title, judul, URL, contents
 
     else:
         wd.get(source)
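Review note on the URL bookkeeping introduced in this hunk: URL starts as None, so the first URL = URL + url raises TypeError, and later concatenations run the links together with no separator. A minimal sketch of list-based accumulation instead, assuming the caller only needs the visited links joined into one string (collect_paragraphs and scrape_one are illustrative names, not from the commit):

def collect_paragraphs(links, scrape_one):
    # Sketch only: scrape_one(url) stands in for the Selenium/BeautifulSoup
    # logic above and returns a list of paragraph strings for one URL.
    artikel, visited = [], []
    for url in links:
        artikel.extend(scrape_one(url))
        visited.append(url)
    # Join once at the end: no None + str TypeError, readable separators.
    return ' '.join(artikel), ', '.join(visited)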
@@ -270,9 +272,9 @@ def scrap_artikel(source_type,source,models,api_key,azure_api_base,keyword):
             artikels = [part1, part2, part3, part4]
         else :
             artikels = [paragraf]
-        title, judul,
+        title, judul, contents = clean_scrap(artikels,models,api_key,azure_api_base,keyword)
 
-        return title, judul,
+        return title, judul, source, contents
 
 def artikel_processing(source_type,source,backlink,keyword,models,api_key,azure_api_base,replicate_key):
     title, judul, url, artikel= scrap_artikel(source_type,source, models, api_key,azure_api_base,keyword)
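Both branches of scrap_artikel now return the same four-slot shape, which artikel_processing unpacks as title, judul, url, artikel. A type sketch of that contract (the alias name and element types are assumptions inferred from the diff, not from the commit):

from typing import List, Tuple

# (title, judul, url, contents): in keyword mode the third slot is the
# concatenated scraped URLs; in direct mode it echoes the input source.
ScrapResult = Tuple[str, str, str, List[str]]  # element types are a guess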
@@ -486,29 +488,12 @@ def artikel_processing(source_type,source,backlink,keyword,models,api_key,azure_api_base,replicate_key):
         os.remove(tmp_path)
     except:
         image = Image.open('botika_logo.jpeg')
-
-        w,h = image.size
-        new_w = int(w/1.641)
-        new_h = int(h/1.641)
-        image = image.resize((new_w, new_h),Image.ANTIALIAS)
-        tmp_path = "image.png"
-        image.save(tmp_path)
-        with open(tmp_path, 'rb') as open_file:
-            byte_img = open_file.read()
-        base64_bytes = base64.b64encode(byte_img)
-        base64_string = base64_bytes.decode('utf-8')
-        base64_string = base64.b64decode(base64_string)
-        image_data= base64_string
-        os.remove(tmp_path)
+
     return judul,content,image,image_data,url
 
 def scrap(source_type,source,backlink,keyword,version,api_key,azure_api_base,replicate_key):
     # try:
-    judul,kontent,gambar,
-    title = '<h1>'+judul+'</h1>'
-    desired_timezone = pytz.timezone('Asia/Jakarta')
-    current_time = datetime.datetime.now(desired_timezone)
-    Timestamp = current_time.strftime('%Y-%m-%d %H:%M:%S')
+    judul,kontent,gambar,image_data,url= artikel_processing(source_type,source,backlink,keyword,version,api_key,azure_api_base,replicate_key)
 
     with open("judul.txt", "w") as file:
         file.write(judul)
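A side note on the fallback block removed above: it relied on Image.ANTIALIAS, which Pillow deprecated in 9.1 and removed in 10.0, so the old code would crash on current Pillow anyway. If the resize is ever restored, a sketch of the modern equivalent (the 1.641 scale factor is copied from the removed lines):

from PIL import Image

# Sketch: same downscale as the removed fallback, using the Resampling
# enum that replaced Image.ANTIALIAS in Pillow >= 9.1.
image = Image.open('botika_logo.jpeg')
w, h = image.size
image = image.resize((int(w / 1.641), int(h / 1.641)), Image.Resampling.LANCZOS)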
@@ -548,8 +533,7 @@ def scrap(source_type,source,backlink,keyword,version,api_key,azure_api_base,replicate_key):
 
     with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
         temp_file.write(combined_data)
-
-
+
     repo_name = get_full_repo_name(model_id="Article_Gen4", token="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")
     file_url = upload_file(
         path_or_fileobj=temp_file.name,
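The hunk above still ships a write-scoped Hugging Face token in source; on a public Space that token should be considered leaked and revoked. A minimal sketch of pulling it from the Space's secrets instead (the HF_TOKEN name and path_in_repo value are assumptions, not from the commit):

import os
from huggingface_hub import get_full_repo_name, upload_file

token = os.environ["HF_TOKEN"]  # assumed secret name, set in Space settings
repo_name = get_full_repo_name(model_id="Article_Gen4", token=token)
file_url = upload_file(
    path_or_fileobj=temp_file.name,    # as in the diff
    path_in_repo="combined_data.txt",  # placeholder; the diff truncates here
    repo_id=repo_name,
    token=token,
)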
@@ -561,6 +545,7 @@ def scrap(source_type,source,backlink,keyword,version,api_key,azure_api_base,replicate_key):
         status = "<h3>Berhasil Generate Artikel</h3>"
         time.sleep(60)
         return status,gambar
+
     else:
         with open('log_activity.txt', 'r') as file:
             existing_data = file.read()