from fastapi import FastAPI, Form
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pathlib import Path
from rag import build_vector_index, query_from_vector, PDF_DIR
from dotenv import load_dotenv
from utils import *
import threading
import os
import traceback
import logging

# ------------------ INIT ------------------
app = FastAPI(title="Virtual Assistant Chatbot API", version="1.0")

# ------------------ CORS ------------------
origins = [
    "https://chabot.demo.viproject.net",
    "chabot.demo.viproject.net",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

os.makedirs(PDF_DIR, exist_ok=True)
OUTPUT_FOLDER = Path("documents")
OUTPUT_FOLDER.mkdir(exist_ok=True)

# ------------------ load env ------------------
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
init_openai(OPENAI_API_KEY)


# ------------------ HELPER ------------------
def clear_documents_folder():
    """Delete every regular file in the documents folder."""
    for file in OUTPUT_FOLDER.iterdir():
        if file.is_file():
            file.unlink()


# ------------------ ROUTES ------------------
@app.get("/")
def root():
    """Health check: confirm the API is up."""
    return {"message": "Virtual Assistant API aktif dan siap digunakan!"}


@app.post("/build-knowledge")
async def build_vector_db():
    """Build the FAISS vector database from all PDFs in PDF_DIR."""
    try:
        result = build_vector_index()
        return JSONResponse(result)
    except Exception as e:
        tb = traceback.format_exc()
        logger.error(f"/build_vector_db error: {e}\n{tb}")
        return JSONResponse({"error": str(e), "traceback": tb}, status_code=500)


@app.post("/web-scraping/")
async def scrape_urls(filename: str = Form(...), urls: str = Form(...), location: str = "server"):
    """
    Fetch every URL, merge the extracted text, clean it with OpenAI,
    and save the result as a single PDF.

    Form fields:
        filename: base name (without extension) for the generated PDF.
        urls: comma-separated list of URLs to scrape.
        location: "local" selects init_driver_local(), anything else init_driver().
    """
    # Remove all previously generated files so only the new PDF remains
    clear_documents_folder()

    # Parse the comma-separated URL list, dropping empty entries
    url_list = [u.strip() for u in urls.split(",") if u.strip()]
    extracted_texts = [None] * len(url_list)
    threads = []

    # STEP 1: extract each page in its own thread; each worker owns
    # (and always quits) its own webdriver instance
    def worker_extract(i, url):
        driver = init_driver_local() if location == "local" else init_driver()
        try:
            extracted_texts[i] = fetch_page_text(driver, url)
        finally:
            driver.quit()

    for i, url in enumerate(url_list):
        t = threading.Thread(target=worker_extract, args=(i, url))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()

    # STEP 2: merge all extracted texts, tagging each with its source URL
    combined_text = "".join(
        f"===== URL: {url} =====\n{text}\n\n"
        for url, text in zip(url_list, extracted_texts)
    )

    # STEP 3: clean the merged text with OpenAI
    cleaned_text = clean_text_with_openai(combined_text)

    # STEP 4: save the PDF.
    # FIX: the submitted `filename` form field was previously ignored
    # (the path used a broken literal placeholder). Path(...).name strips
    # any directory components so an untrusted filename cannot escape
    # OUTPUT_FOLDER via path traversal.
    safe_name = Path(filename).name or "output"
    output_file = OUTPUT_FOLDER / f"{safe_name}.pdf"
    save_to_pdf(cleaned_text, output_file)

    return JSONResponse({"success": True, "pdf_file": str(output_file)})


@app.post("/ask")
async def ask_question(
    question: str = Form(...),
    session_id: str = Form(None),
):
    """Ask a question against the indexed documents, scoped to a chat session."""
    try:
        result = query_from_vector(
            query=question,
            session_id=session_id,
        )
        return JSONResponse(result)
    except Exception as e:
        # Log the traceback (previously only returned to the client),
        # consistent with the /build-knowledge error handler.
        tb = traceback.format_exc()
        logger.error(f"/ask error: {e}\n{tb}")
        return JSONResponse({"error": str(e), "traceback": tb}, status_code=500)