File size: 3,715 Bytes
5de9ee6
 
94c1837
 
5de9ee6
 
94c1837
 
 
5de9ee6
94c1837
5de9ee6
 
 
 
 
 
 
 
 
94c1837
 
5de9ee6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94c1837
 
 
 
 
 
 
 
 
b7caa49
94c1837
 
 
 
 
 
5de9ee6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94c1837
b720cc9
94c1837
b7caa49
94c1837
 
 
 
 
 
b7caa49
94c1837
 
b7caa49
 
b720cc9
b7caa49
 
 
 
94c1837
 
b7caa49
94c1837
 
 
 
 
 
b7caa49
 
 
 
 
 
 
 
 
 
 
 
 
94c1837
 
5de9ee6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from fastapi import FastAPI, Form
from fastapi.responses import JSONResponse
from pathlib import Path
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from rag import build_vector_index, query_from_vector, PDF_DIR
from dotenv import load_dotenv

from utils import *

import threading
import os
import traceback
import logging

# ------------------ INIT ------------------
# Application object; the route decorators further down register on it.
app = FastAPI(title="Virtual Assistant Chatbot API", version="1.0")

# ------------------ CORS ------------------
# Origins allowed to call this API cross-origin.
# NOTE(review): the second entry has no scheme. Browser Origin headers
# normally carry one ("https://..."), so this entry presumably never
# matches -- confirm and either remove it or add the intended scheme.
origins = [
    "https://chabot.demo.viproject.net",
    "chabot.demo.viproject.net"
]

# Any method and any header is accepted from the origins listed above,
# with credentials allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Module-level logger; logging is configured at INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Create the directory named by PDF_DIR (imported from rag) if it is missing.
os.makedirs(PDF_DIR, exist_ok=True)

# Folder the /web-scraping/ route writes its PDF into, and the folder
# clear_documents_folder() empties. The path is relative, so it resolves
# against the process working directory.
OUTPUT_FOLDER = Path("documents")
OUTPUT_FOLDER.mkdir(exist_ok=True)

# ------------------ load env ------------------
# load_dotenv() runs first so that a key defined in a .env file is visible
# to os.getenv below. OPENAI_API_KEY is None when the variable is not set.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# init_openai is not defined in this file -- presumably provided by the
# star import from utils; how it treats a None key is not visible here.
init_openai(OPENAI_API_KEY)


# ------------------ HELPER ------------------        
def clear_documents_folder():
    """Delete every file located directly inside the documents folder.

    Entries that are not files (for example subdirectories) are left alone.
    """
    # filter() is lazy, so each entry is checked and removed one at a time.
    for entry in filter(Path.is_file, OUTPUT_FOLDER.iterdir()):
        entry.unlink()

# ------------------ ROUTES ------------------
@app.get("/")
def root():
    """Check the API status."""
    status_payload = dict(message="Virtual Assistant API aktif dan siap digunakan!")
    return status_payload


@app.post("/build-knowledge")
async def build_vector_db():
    """Build the FAISS vector database from all PDFs.

    Returns whatever ``build_vector_index()`` produces as a JSON response.
    If building or serializing fails, the error is logged and a 500 JSON
    response carrying the error message and traceback is returned instead.
    """
    try:
        index_summary = build_vector_index()
        # Kept inside the try so a non-serializable result is also reported.
        return JSONResponse(index_summary)
    except Exception as exc:
        trace = traceback.format_exc()
        logger.error(f"/build_vector_db error: {exc}\n{trace}")
        failure_payload = {"error": str(exc), "traceback": trace}
        return JSONResponse(failure_payload, status_code=500)


def _safe_pdf_stem(filename):
    """Reduce *filename* to its final path component, or None if unusable."""
    # Backslashes are treated as separators too, so Windows-style paths
    # cannot smuggle directory components through on POSIX hosts.
    stem = Path(filename.replace("\\", "/")).name
    if not stem.strip() or stem in {".", ".."}:
        return None
    return stem


def _fetch_all_pages(url_list, location):
    """Fetch each URL in its own thread; return one text (or None) per URL."""
    texts = [None] * len(url_list)

    def worker_extract(i, url):
        # An exception escaping a thread never reaches the request handler,
        # so it is logged here and the slot is left as None for the caller.
        try:
            driver = init_driver_local() if location == "local" else init_driver()
            try:
                texts[i] = fetch_page_text(driver, url)
            finally:
                driver.quit()
        except Exception:
            logger.exception("Failed to extract text from %s", url)

    threads = [
        threading.Thread(target=worker_extract, args=(i, url))
        for i, url in enumerate(url_list)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return texts


@app.post("/web-scraping/")
async def scrape_urls(filename: str = Form(...), urls: str = Form(...), location: str = "server"):
    """Scrape the given URLs, merge and clean their text, and save one PDF.

    Args:
        filename: Name of the PDF to create (without the ``.pdf`` suffix).
            Only the final path component is used, so the file always
            lands inside the documents folder.
        urls: Comma-separated list of URLs; blank entries are ignored.
        location: ``"local"`` selects ``init_driver_local()``; any other
            value selects ``init_driver()``.

    Returns:
        On success, JSON with ``success``, ``pdf_file`` and ``failed_urls``
        (URLs whose text could not be fetched and were left out of the PDF).
        400 for an unusable filename or an empty URL list, 502 when no URL
        could be fetched, 500 for any other failure.
    """
    try:
        # Validate before touching the documents folder so a bad request
        # never destroys the existing files.
        url_list = [u.strip() for u in urls.split(",") if u.strip()]
        if not url_list:
            return JSONResponse(
                {"success": False, "error": "No URLs provided"}, status_code=400
            )

        safe_name = _safe_pdf_stem(filename)
        if safe_name is None:
            return JSONResponse(
                {"success": False, "error": "Invalid filename"}, status_code=400
            )

        # STEP 1: extract the pages.
        # NOTE(review): thread joins and the calls below block the event loop
        # for the duration of the request, as in the original implementation.
        extracted_texts = _fetch_all_pages(url_list, location)

        # STEP 2: merge the texts, skipping URLs that produced nothing so the
        # literal string "None" never ends up in the document.
        sections = []
        failed_urls = []
        for url, text in zip(url_list, extracted_texts):
            if text is None:
                failed_urls.append(url)
            else:
                sections.append(f"===== URL: {url} =====\n{text}\n\n")

        if not sections:
            return JSONResponse(
                {
                    "success": False,
                    "error": "Could not extract text from any URL",
                    "failed_urls": failed_urls,
                },
                status_code=502,
            )
        combined_text = "".join(sections)

        # STEP 3: clean with OpenAI.
        cleaned_text = clean_text_with_openai(combined_text)

        # STEP 4: replace the old documents only once the new content is ready.
        clear_documents_folder()
        output_file = OUTPUT_FOLDER / f"{safe_name}.pdf"
        save_to_pdf(cleaned_text, output_file)

        return JSONResponse(
            {"success": True, "pdf_file": str(output_file), "failed_urls": failed_urls}
        )
    except Exception as e:
        # The traceback goes to the server log only, not to the client.
        logger.exception("/web-scraping error: %s", e)
        return JSONResponse({"success": False, "error": str(e)}, status_code=500)
    

@app.post("/ask")
async def ask_question(
    question: str = Form(...),
    session_id: str = Form(None),
):
    """Ask a question against the indexed documents within a chat session.

    Args:
        question: The user's question; must contain non-whitespace text.
        session_id: Optional chat session identifier, passed through to
            ``query_from_vector`` unchanged.

    Returns:
        The result of ``query_from_vector`` as JSON; 400 when the question
        is blank; 500 with the error message and traceback on failure.
    """
    # Reject blank input up front rather than sending it through the
    # query pipeline.
    if not question.strip():
        return JSONResponse({"error": "Question must not be empty"}, status_code=400)

    try:
        result = query_from_vector(
            query=question,
            session_id=session_id,
        )
        return JSONResponse(result)
    except Exception as e:
        tb = traceback.format_exc()
        # Log server-side as well, matching the /build-knowledge handler;
        # previously failures here were only visible to the client.
        logger.error(f"/ask error: {e}\n{tb}")
        return JSONResponse({"error": str(e), "traceback": tb}, status_code=500)