Navy committed on
Commit
b7caa49
·
1 Parent(s): a182d91

scraping update

Browse files
Files changed (2) hide show
  1. main.py +25 -21
  2. utils.py +18 -3
main.py CHANGED
@@ -44,18 +44,7 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
44
  init_openai(OPENAI_API_KEY)
45
 
46
 
47
# ------------------ HELPER ------------------
def process_url(url: str):
    """Scrape one URL, clean its text with OpenAI, and write it to a PDF.

    Returns:
        The path of the generated PDF file as a string.
    """
    browser = init_driver()
    try:
        page_text = fetch_page_text(browser, url)
        tidy_text = clean_text_with_openai(page_text)
        target_pdf = url_to_filename(url, OUTPUT_FOLDER)
        save_to_pdf(tidy_text, target_pdf)
        return str(target_pdf)
    finally:
        # Always release the WebDriver, even if scraping or cleaning fails.
        browser.quit()
59
  def clear_documents_folder():
60
  """Hapus semua file di folder documents"""
61
  for file in OUTPUT_FOLDER.iterdir():
@@ -82,32 +71,47 @@ async def build_vector_db():
82
 
83
 
84
  @app.post("/web-scraping/")
85
- async def scrape_urls(urls: str = Form(...)):
86
  """
87
- Terima daftar URL dipisah koma, setiap URL menjadi 1 PDF.
88
- Sebelum memulai, hapus semua file di folder documents.
89
  """
90
  # Hapus semua file lama
91
  clear_documents_folder()
92
 
93
  # Parse URL
94
  url_list = [u.strip() for u in urls.split(",") if u.strip()]
95
- pdf_files = [None] * len(url_list)
96
-
97
  threads = []
98
 
99
- def worker(i, url):
100
- pdf_files[i] = process_url(url)
 
 
 
 
 
101
 
102
  for i, url in enumerate(url_list):
103
- t = threading.Thread(target=worker, args=(i, url))
104
  t.start()
105
  threads.append(t)
106
 
107
  for t in threads:
108
  t.join()
109
 
110
- return JSONResponse({"success": True, "pdf_files": pdf_files})
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
 
113
  @app.post("/ask")
 
44
  init_openai(OPENAI_API_KEY)
45
 
46
 
47
+ # ------------------ HELPER ------------------
 
 
 
 
 
 
 
 
 
 
 
48
  def clear_documents_folder():
49
  """Hapus semua file di folder documents"""
50
  for file in OUTPUT_FOLDER.iterdir():
 
71
 
72
 
73
@app.post("/web-scraping/")
async def scrape_urls(filename: str = Form(...), urls: str = Form(...), select: str = "huggingface"):
    """Scrape every URL, merge the cleaned text, and save one combined PDF.

    Before starting, all files in the documents folder are deleted.

    Args:
        filename: Base name (without extension) for the output PDF.
        urls: Comma-separated list of URLs; each URL is scraped separately.
        select: "local" uses the local Chrome driver (init_driver_local);
            any other value uses the default driver (init_driver).

    Returns:
        JSONResponse with ``success`` and the path of the generated PDF.
    """
    # Remove stale output from previous runs.
    clear_documents_folder()

    # Parse the comma-separated URL list, ignoring empty entries.
    url_list = [u.strip() for u in urls.split(",") if u.strip()]
    extracted_texts = [None] * len(url_list)
    threads = []

    # STEP 1: extract each page in its own thread — Selenium page loads are
    # I/O-bound, so threads overlap the waits. NOTE(review): one thread (and
    # one browser) per URL is unbounded; consider a pool for large lists.
    def worker_extract(i, url):
        driver = init_driver_local() if select == "local" else init_driver()
        try:
            extracted_texts[i] = fetch_page_text(driver, url)
        finally:
            driver.quit()

    for i, url in enumerate(url_list):
        t = threading.Thread(target=worker_extract, args=(i, url))
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    # STEP 2: merge all page texts, tagging each section with its source URL.
    combined_text = ""
    for url, text in zip(url_list, extracted_texts):
        combined_text += f"===== URL: {url} =====\n{text}\n\n"

    # STEP 3: clean the combined text with OpenAI.
    cleaned_text = clean_text_with_openai(combined_text)

    # STEP 4: save the result as a single PDF.
    # BUG FIX: the output name was a literal placeholder and ignored the
    # `filename` form field, so every request produced the same file name.
    output_file = OUTPUT_FOLDER / f"{filename}.pdf"
    save_to_pdf(cleaned_text, output_file)

    return JSONResponse({"success": True, "pdf_file": str(output_file)})
115
 
116
 
117
  @app.post("/ask")
utils.py CHANGED
@@ -1,6 +1,3 @@
1
- from selenium.webdriver.chrome.service import Service
2
- from selenium.webdriver.chrome.options import Options
3
- from selenium import webdriver
4
  from bs4 import BeautifulSoup
5
  from pathlib import Path
6
  from fpdf import FPDF
@@ -60,6 +57,10 @@ logging.basicConfig(
60
 
61
  # ---------------- SELENIUM ----------------
62
  def init_driver(headless=True):
 
 
 
 
63
  options = Options()
64
  if headless:
65
  options.add_argument("--headless=new")
@@ -73,6 +74,20 @@ def init_driver(headless=True):
73
  logging.info("WebDriver berhasil diinisialisasi")
74
  return driver
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  def fetch_page_text(driver, url: str) -> str:
77
  logging.info("Mengambil halaman: %s", url)
78
  driver.get(url)
 
 
 
 
1
  from bs4 import BeautifulSoup
2
  from pathlib import Path
3
  from fpdf import FPDF
 
57
 
58
  # ---------------- SELENIUM ----------------
59
  def init_driver(headless=True):
60
+ from selenium.webdriver.chrome.service import Service
61
+ from selenium.webdriver.chrome.options import Options
62
+ from selenium import webdriver
63
+
64
  options = Options()
65
  if headless:
66
  options.add_argument("--headless=new")
 
74
  logging.info("WebDriver berhasil diinisialisasi")
75
  return driver
76
 
77
+
78
def init_driver_local(headless=True):
    """Build a plain local Chrome WebDriver (no custom Service path).

    Unlike ``init_driver``, this lets Selenium resolve the chromedriver
    binary itself. NOTE(review): the diff lost indentation — assumed only
    ``--headless``/``--disable-gpu`` are conditional on *headless*, matching
    the sibling ``init_driver``; confirm against the original file.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    opts = Options()
    if headless:
        opts.add_argument("--headless")
        opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--window-size=1920,1080")
    return webdriver.Chrome(options=opts)
90
+
91
  def fetch_page_text(driver, url: str) -> str:
92
  logging.info("Mengambil halaman: %s", url)
93
  driver.get(url)