# NOTE(review): the three lines below ("Spaces: / Sleeping / Sleeping") were
# Hugging Face Spaces page-status residue from the scrape, not Python code.
| from bs4 import BeautifulSoup | |
| from pathlib import Path | |
| from fpdf import FPDF | |
| from urllib.parse import urlparse | |
| import openai | |
| import time | |
| import os | |
| import logging | |
# ---------------- OPENAI ----------------
def init_openai(api_key: str):
    """Register *api_key* on the module-level ``openai`` client.

    Must be called once before :func:`clean_text_with_openai`.
    """
    openai.api_key = api_key
def clean_text_with_openai(raw_text: str) -> str:
    """Clean and summarize scraped website text via OpenAI (gpt-4o-mini).

    The model is instructed (in Indonesian) to drop navigation/footer/menu
    noise and duplicates while keeping every URL, email and official contact,
    producing sectioned text ready for FAISS-based RAG retrieval.

    Parameters
    ----------
    raw_text : str
        Raw page text as extracted by :func:`fetch_page_text`.

    Returns
    -------
    str
        The cleaned, section-structured text (whitespace-stripped).
    """
    # User prompt: task instructions followed by the raw text to clean.
    prompt = f"""
Tolong bersihkan dan ringkas teks berikut agar menjadi relevan untuk customer service:
- Hilangkan navigasi, footer, menu, iklan, dan duplikasi
- Pertahankan semua informasi penting, termasuk URL/link website, email, dan kontak resmi
- Rangkum secara rapi per section dengan judul yang jelas
- Hasil akhir tetap mudah dibaca dan lengkap untuk menjawab pertanyaan pengguna
Teks mentah:
{raw_text}
"""

    # System role: pins the cleaning contract for the RAG pipeline.
    system_instructions = (
        "Kamu adalah asisten yang merapikan teks website agar siap digunakan dalam RAG chatbot berbasis FAISS.\n"
        "Instruksi:\n"
        "- Pisahkan konten menjadi bagian-bagian logis: Produk, Layanan, Testimoni, Kontak, Informasi Perusahaan, dsb.\n"
        "- Hapus elemen navigasi, menu, footer, iklan, dan duplikasi.\n"
        "- Jangan menghapus URL/link, email, atau informasi kontak resmi.\n"
        "- Format teks supaya mudah dicari oleh vector database: paragraf pendek, bullet point untuk list, judul section.\n"
        "- Jangan menambahkan opini, komentar, atau promosi tambahan.\n"
        "- Tetap pertahankan semua informasi penting agar chatbot bisa menjawab pertanyaan pengguna dengan akurat.\n"
        "- Output harus bersih, ringkas, lengkap, dan siap diindeks untuk retrieval."
    )

    messages = [
        {"role": "system", "content": system_instructions},
        {"role": "user", "content": prompt},
    ]

    # temperature=0 keeps the cleanup deterministic across reruns.
    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
    )
    return completion.choices[0].message.content.strip()
# ---------------- LOGGING ----------------
# Module-level logging setup: runs once at import time and configures the
# root logger (INFO level, timestamped format). NOTE(review): basicConfig
# is a no-op if the embedding application has already configured logging —
# confirm this module is the intended place for global log configuration.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
# ---------------- SELENIUM ----------------
def init_driver(headless=True,
                chromium_path="/usr/bin/chromium",
                chromedriver_path="/usr/bin/chromedriver"):
    """Create a headless-capable Chrome/Chromium WebDriver for container use.

    Generalization: the Chromium binary and chromedriver locations were
    hard-coded (HF Spaces paths flagged by the original comments); they are
    now parameters whose defaults preserve the previous behavior exactly.

    Parameters
    ----------
    headless : bool
        Run without a visible window (uses Chrome's new headless mode).
    chromium_path : str
        Path to the Chromium/Chrome binary.
    chromedriver_path : str
        Path to the chromedriver executable.

    Returns
    -------
    selenium.webdriver.Chrome
        A ready-to-use driver; caller is responsible for ``driver.quit()``.
    """
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.chrome.options import Options
    from selenium import webdriver

    options = Options()
    if headless:
        options.add_argument("--headless=new")
    # Flags required for running Chrome inside containers/sandboxed hosts.
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-gpu")
    options.binary_location = chromium_path

    service = Service(chromedriver_path)
    driver = webdriver.Chrome(service=service, options=options)
    logging.info("WebDriver berhasil diinisialisasi")
    return driver
def init_driver_local(headless=True):
    """Create a Chrome WebDriver for local development (chromedriver on PATH).

    Parameters
    ----------
    headless : bool
        When true, run Chrome without a visible window.

    Returns
    -------
    selenium.webdriver.Chrome
        A ready-to-use driver; caller is responsible for ``driver.quit()``.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    opts = Options()
    flags = ["--disable-gpu", "--no-sandbox", "--window-size=1920,1080"]
    if headless:
        flags.insert(0, "--headless")
    for flag in flags:
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)
def fetch_page_text(driver, url: str) -> str:
    """Load *url* in *driver* and return its visible text.

    Prefers the ``<main>`` element when present, otherwise falls back to the
    whole document. Text nodes are joined with newlines and stripped.

    Parameters
    ----------
    driver : selenium WebDriver
        An initialized driver (see :func:`init_driver`).
    url : str
        Page to fetch.

    Returns
    -------
    str
        Newline-separated visible text of the page.
    """
    logging.info("Mengambil halaman: %s", url)
    driver.get(url)
    time.sleep(2)  # fixed grace period so client-side JS can finish rendering

    dom = BeautifulSoup(driver.page_source, "html.parser")
    root = dom.find("main")
    if root is None:
        root = dom
    text = root.get_text(separator="\n", strip=True)

    logging.info("Halaman berhasil diambil (%d karakter)", len(text))
    return text
# ---------------- PDF ----------------
def save_to_pdf(text: str, output_file: Path):
    """Render *text* into a PDF at *output_file*, one multi_cell row per line.

    Bug fix: FPDF's built-in core fonts (Arial here) only cover Latin-1, so
    scraped web text or LLM output containing curly quotes, em dashes or
    emoji would raise a UnicodeEncodeError when the PDF is written. Each
    line is now transcoded to Latin-1 with unsupported characters replaced
    by ``?`` before rendering.

    Parameters
    ----------
    text : str
        Newline-separated text to render.
    output_file : Path
        Destination PDF path (parent directory must exist).
    """
    logging.info("Menyimpan teks ke PDF: %s", output_file)
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    for line in text.split("\n"):
        # Degrade non-Latin-1 characters instead of crashing on output.
        safe_line = line.encode("latin-1", "replace").decode("latin-1")
        if len(safe_line) > 120:
            # Pre-chunk very long lines; multi_cell wraps on its own, but
            # chunking keeps pathological single-line input manageable.
            for start in range(0, len(safe_line), 120):
                pdf.multi_cell(0, 6, safe_line[start:start + 120])
        else:
            pdf.multi_cell(0, 6, safe_line)
        pdf.ln(1)

    pdf.output(str(output_file))
    logging.info("PDF berhasil disimpan: %s", output_file)
def url_to_filename(url: str, folder: Path) -> Path:
    """Derive a filesystem-safe PDF path inside *folder* for *url*.

    The hostname's dots and the URL path's slashes are replaced with
    underscores, e.g. ``https://a.b/c/d`` -> ``a_b_c_d.pdf``.

    Parameters
    ----------
    url : str
        Absolute URL of the scraped page.
    folder : Path
        Directory the PDF will live in (not created here).

    Returns
    -------
    Path
        ``folder / <sanitized-name>.pdf``.
    """
    parts = urlparse(url)
    host = parts.netloc.replace(".", "_")
    slug = parts.path.strip("/").replace("/", "_")
    name = f"{host}_{slug}.pdf" if slug else f"{host}.pdf"
    target = folder / name
    logging.info("Nama file PDF untuk URL '%s': %s", url, target)
    return target