# Navy scraping update (commit b720cc9)
from fastapi import FastAPI, Form
from fastapi.responses import JSONResponse
from pathlib import Path
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from rag import build_vector_index, query_from_vector, PDF_DIR
from dotenv import load_dotenv
from utils import *
import threading
import os
import traceback
import logging
# ------------------ INIT ------------------
app = FastAPI(title="Virtual Assistant Chatbot API", version="1.0")
# ------------------ CORS ------------------
# Browser origins allowed to call this API from a frontend.
# NOTE(review): both entries spell "chabot" (not "chatbot") — confirm the
# deployed frontend domain really uses this spelling before "fixing" it.
origins = [
"https://chabot.demo.viproject.net",
"chabot.demo.viproject.net"
]
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Module-level logger; INFO level so indexing/request activity is visible.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# Ensure the PDF source directory (defined in rag.py) exists at startup.
os.makedirs(PDF_DIR, exist_ok=True)
# Folder where PDFs generated by /web-scraping/ are written.
OUTPUT_FOLDER = Path("documents")
OUTPUT_FOLDER.mkdir(exist_ok=True)
# ------------------ load env ------------------
# Load .env, then hand the OpenAI key to the utils client initializer.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
init_openai(OPENAI_API_KEY)
# ------------------ HELPER ------------------
def clear_documents_folder(folder=None):
    """Delete every regular file in *folder*.

    Parameters
    ----------
    folder : path-like, optional
        Directory to clear. Defaults to the module-level ``OUTPUT_FOLDER``
        (the ``documents`` directory), preserving the original no-argument
        call used by the /web-scraping/ route.

    Subdirectories and their contents are left untouched; only top-level
    regular files are removed.
    """
    target = OUTPUT_FOLDER if folder is None else Path(folder)
    for entry in target.iterdir():
        if entry.is_file():
            # missing_ok guards against a concurrent request deleting the
            # same file between iterdir() and unlink().
            entry.unlink(missing_ok=True)
# ------------------ ROUTES ------------------
@app.get("/")
def root():
    """Health-check endpoint: confirm the API is up and ready."""
    status_message = "Virtual Assistant API aktif dan siap digunakan!"
    return {"message": status_message}
@app.post("/build-knowledge")
async def build_vector_db():
    """Build the FAISS vector database from every PDF in ``PDF_DIR``.

    Returns the result dict from ``build_vector_index`` on success, or a
    JSON payload with the error message and traceback (HTTP 500) on failure.
    """
    try:
        result = build_vector_index()
        return JSONResponse(result)
    except Exception as e:
        tb = traceback.format_exc()
        # Log under the real route path — the previous message said
        # "/build_vector_db", which is not a registered endpoint.
        # Lazy %-style args avoid formatting work when logging is disabled.
        logger.error("/build-knowledge error: %s\n%s", e, tb)
        return JSONResponse({"error": str(e), "traceback": tb}, status_code=500)
@app.post("/web-scraping/")
async def scrape_urls(filename: str = Form(...), urls: str = Form(...), location: str = "server"):
    """
    Fetch every URL, merge the extracted text, clean it with OpenAI, and
    save the result as a single PDF.

    Parameters
    ----------
    filename : form field — base name (without extension) for the output PDF.
    urls     : form field — comma-separated list of URLs to scrape.
    location : "local" selects the local webdriver; any other value uses the
               server driver.
    """
    try:
        # Remove previously generated documents so only this run's output remains.
        clear_documents_folder()

        # Parse the comma-separated URL list, dropping empty entries.
        url_list = [u.strip() for u in urls.split(",") if u.strip()]
        extracted_texts = [None] * len(url_list)
        threads = []

        # STEP 1: extract each page in its own thread — the work is
        # network-bound, so threads overlap the waits.
        def worker_extract(i, url):
            driver = init_driver_local() if location == "local" else init_driver()
            try:
                extracted_texts[i] = fetch_page_text(driver, url)
            except Exception as exc:
                # Record the failure instead of leaving None, so the combined
                # document shows which URL failed (previously the literal
                # text "None" was embedded silently).
                extracted_texts[i] = f"[ERROR fetching {url}: {exc}]"
            finally:
                driver.quit()

        for i, url in enumerate(url_list):
            t = threading.Thread(target=worker_extract, args=(i, url))
            t.start()
            threads.append(t)
        for t in threads:
            t.join()

        # STEP 2: merge all extracted text, labelled per source URL.
        combined_text = "".join(
            f"===== URL: {url} =====\n{text}\n\n"
            for url, text in zip(url_list, extracted_texts)
        )

        # STEP 3: clean the merged text with OpenAI.
        cleaned_text = clean_text_with_openai(combined_text)

        # STEP 4: save the PDF under the submitted filename. The old code
        # ignored the `filename` form field and always wrote the same
        # hard-coded name. Path(...).name strips any directory components
        # from the untrusted input (path-traversal guard).
        safe_name = Path(filename).name or "document"
        output_file = OUTPUT_FOLDER / f"{safe_name}.pdf"
        save_to_pdf(cleaned_text, output_file)
        return JSONResponse({"success": True, "pdf_file": str(output_file)})
    except Exception as e:
        tb = traceback.format_exc()
        logger.error("/web-scraping error: %s\n%s", e, tb)
        return JSONResponse({"error": str(e), "traceback": tb}, status_code=500)
@app.post("/ask")
async def ask_question(
    question: str = Form(...),
    session_id: str = Form(None),
):
    """Ask a question against the indexed documents within a chat session.

    Delegates to ``query_from_vector``; on failure, responds with the error
    message and traceback as a 500 JSON payload.
    """
    try:
        answer = query_from_vector(query=question, session_id=session_id)
        return JSONResponse(answer)
    except Exception as exc:
        trace = traceback.format_exc()
        return JSONResponse({"error": str(exc), "traceback": trace}, status_code=500)