Update app.py

app.py CHANGED
@@ -129,7 +129,7 @@ def scrap_portal(queri):
     filter_link2 = [url for url in filter_link1 if "help" not in url]
     return filter_link2
 
-def clean_scrap(artikel,link,models,api_key,azure_api_base,keyword):
+def clean_scrap(artikel,models,api_key,azure_api_base,keyword):
     new_artikel = []
     article = []
     if len(artikel) > 1:
@@ -201,7 +201,7 @@ def clean_scrap(artikel,link,models,api_key,azure_api_base,keyword):
     contents = content[1:]
     contents = [' '.join(contents).replace("article:", '').replace("Article:", '').strip()]
 
-    return title, judul,
+    return title, judul, contents
 
 def scrap_artikel(source_type,source,models,api_key,azure_api_base,keyword):
     options = webdriver.ChromeOptions()
@@ -216,6 +216,7 @@ def scrap_artikel(source_type,source,models,api_key,azure_api_base,keyword):
 
     if source_type == "keyword":
         artikel =[]
+        URL = None
         link = scrap_portal(source)
         for url in link:
            if cek_url(url):
@@ -236,16 +237,17 @@ def scrap_artikel(source_type,source,models,api_key,azure_api_base,keyword):
             for paragraph in containers:
                 artic=paragraph.get_text()
                 artikel.append(artic)
+            URL = URL + url
 
-
-
-
-
-
-
-
+        paragraf = ' '.join(artikel)
+        if len(paragraf)>= 18000:
+            part1, part2, part3, part4 = split_article(paragraf)
+            artikels = [part1, part2, part3, part4]
+        else :
+            artikels = [paragraf]
+        title, judul, contents = clean_scrap(artikels,models,api_key,azure_api_base,keyword)
 
-
+        return title, judul, URL, contents
 
     else:
         wd.get(source)
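Review note on the URL bookkeeping introduced in this hunk: URL starts as None, so the first URL = URL + url raises TypeError, and later concatenations run the links together with no separator. A minimal sketch of list-based accumulation instead, assuming the caller only needs the visited links joined into one string (collect_paragraphs and scrape_one are illustrative names, not from the commit):

def collect_paragraphs(links, scrape_one):
    # Sketch only: scrape_one(url) stands in for the Selenium/BeautifulSoup
    # logic above and returns a list of paragraph strings for one URL.
    artikel, visited = [], []
    for url in links:
        artikel.extend(scrape_one(url))
        visited.append(url)
    # Join once at the end: no None + str TypeError, readable separators.
    return ' '.join(artikel), ', '.join(visited)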
@@ -270,9 +272,9 @@ def scrap_artikel(source_type,source,models,api_key,azure_api_base,keyword):
             artikels = [part1, part2, part3, part4]
         else :
             artikels = [paragraf]
-        title, judul,
+        title, judul, contents = clean_scrap(artikels,models,api_key,azure_api_base,keyword)
 
-        return title, judul,
+        return title, judul, source, contents
 
 def artikel_processing(source_type,source,backlink,keyword,models,api_key,azure_api_base,replicate_key):
     title, judul, url, artikel= scrap_artikel(source_type,source, models, api_key,azure_api_base,keyword)
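Both branches of scrap_artikel now return the same four-slot shape, which artikel_processing unpacks as title, judul, url, artikel. A type sketch of that contract (the alias name and element types are assumptions inferred from the diff, not from the commit):

from typing import List, Tuple

# (title, judul, url, contents): in keyword mode the third slot is the
# concatenated scraped URLs; in direct mode it echoes the input source.
ScrapResult = Tuple[str, str, str, List[str]]  # element types are a guess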
@@ -486,29 +488,12 @@ def artikel_processing(source_type,source,backlink,keyword,models,api_key,azure_api_base,replicate_key):
         os.remove(tmp_path)
     except:
         image = Image.open('botika_logo.jpeg')
-
-        w,h = image.size
-        new_w = int(w/1.641)
-        new_h = int(h/1.641)
-        image = image.resize((new_w, new_h),Image.ANTIALIAS)
-        tmp_path = "image.png"
-        image.save(tmp_path)
-        with open(tmp_path, 'rb') as open_file:
-            byte_img = open_file.read()
-        base64_bytes = base64.b64encode(byte_img)
-        base64_string = base64_bytes.decode('utf-8')
-        base64_string = base64.b64decode(base64_string)
-        image_data= base64_string
-        os.remove(tmp_path)
+
     return judul,content,image,image_data,url
 
 def scrap(source_type,source,backlink,keyword,version,api_key,azure_api_base,replicate_key):
     # try:
-    judul,kontent,gambar,
-    title = '<h1>'+judul+'</h1>'
-    desired_timezone = pytz.timezone('Asia/Jakarta')
-    current_time = datetime.datetime.now(desired_timezone)
-    Timestamp = current_time.strftime('%Y-%m-%d %H:%M:%S')
+    judul,kontent,gambar,image_data,url= artikel_processing(source_type,source,backlink,keyword,version,api_key,azure_api_base,replicate_key)
 
     with open("judul.txt", "w") as file:
         file.write(judul)
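A side note on the fallback block removed above: it relied on Image.ANTIALIAS, which Pillow deprecated in 9.1 and removed in 10.0, so the old code would crash on current Pillow anyway. If the resize is ever restored, a sketch of the modern equivalent (the 1.641 scale factor is copied from the removed lines):

from PIL import Image

# Sketch: same downscale as the removed fallback, using the Resampling
# enum that replaced Image.ANTIALIAS in Pillow >= 9.1.
image = Image.open('botika_logo.jpeg')
w, h = image.size
image = image.resize((int(w / 1.641), int(h / 1.641)), Image.Resampling.LANCZOS)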
@@ -548,8 +533,7 @@ def scrap(source_type,source,backlink,keyword,version,api_key,azure_api_base,replicate_key):
 
     with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
         temp_file.write(combined_data)
-
-
+
     repo_name = get_full_repo_name(model_id="Article_Gen4", token="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")
     file_url = upload_file(
         path_or_fileobj=temp_file.name,
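The hunk above still ships a write-scoped Hugging Face token in source; on a public Space that token should be considered leaked and revoked. A minimal sketch of pulling it from the Space's secrets instead (the HF_TOKEN name and path_in_repo value are assumptions, not from the commit):

import os
from huggingface_hub import get_full_repo_name, upload_file

token = os.environ["HF_TOKEN"]  # assumed secret name, set in Space settings
repo_name = get_full_repo_name(model_id="Article_Gen4", token=token)
file_url = upload_file(
    path_or_fileobj=temp_file.name,    # as in the diff
    path_in_repo="combined_data.txt",  # placeholder; the diff truncates here
    repo_id=repo_name,
    token=token,
)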
@@ -561,6 +545,7 @@ def scrap(source_type,source,backlink,keyword,version,api_key,azure_api_base,replicate_key):
         status = "<h3>Berhasil Generate Artikel</h3>"
         time.sleep(60)
         return status,gambar
+
     else:
         with open('log_activity.txt', 'r') as file:
             existing_data = file.read()