# RAG_pxpxpxpx / app.py
# (origin: syafiqq02's Hugging Face Space, commit 8fb48a4)
import os
import pdfplumber
import docx
import chromadb
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.responses import JSONResponse
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from typing import List
# Load environment variables from a local .env file (expects GROQ_API_KEY).
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise Exception("❌ GROQ API Key tidak ditemukan! Pastikan sudah menyimpan API Key di file .env.")
# FastAPI application instance.
app = FastAPI(title="Document Chat API (FastAPI + ChromaDB + Groq)")
# ChromaDB persistent (on-disk) vector store; all document chunks live in one collection.
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.get_or_create_collection(name="document_chunks")
# Sentence-transformer model used for both document and query embeddings.
embedding_model = SentenceTransformer("sangmini/msmarco-cotmae-MiniLM-L12_en-ko-ja")
# Groq-hosted chat model used for question answering.
chat_groq = ChatGroq(api_key=GROQ_API_KEY, model_name="qwen-2.5-coder-32b")
# Extract raw text from an uploaded document.
def extract_text_from_file(file_path: str, file_type: str) -> str:
    """Extract plain text from a PDF or DOCX file.

    Args:
        file_path: Path of the file on disk.
        file_type: Lowercase extension, either "pdf" or "docx".

    Returns:
        The extracted text; an empty string for unsupported types or
        files that yield no text (callers treat "" as a failure).
    """
    if file_type == "pdf":
        with pdfplumber.open(file_path) as pdf:
            # join() instead of repeated += avoids quadratic string building;
            # extract_text() may return None for image-only pages.
            return "".join(page.extract_text() or "" for page in pdf.pages)
    if file_type == "docx":
        doc = docx.Document(file_path)
        return "\n".join(para.text for para in doc.paragraphs)
    return ""
# Chunk, embed, and persist a document into the ChromaDB collection.
def store_document(file_path: str, file_type: str) -> str:
    """Extract, chunk, embed, and store a document's text in ChromaDB.

    Args:
        file_path: Path of the file on disk (also used to derive chunk ids).
        file_type: Lowercase extension, either "pdf" or "docx".

    Returns:
        A human-readable status string (failure message if no text could
        be extracted, success message otherwise).
    """
    text = extract_text_from_file(file_path, file_type)
    if not text:
        return "❌ Gagal mengekstrak teks dari file."
    # Overlapping chunks keep each embedding within the model's input limits
    # while preserving context across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_text(text)
    # Batch-encode every chunk in ONE model call and insert them with a
    # single ChromaDB round trip — far cheaper than the per-chunk loop.
    embeddings = embedding_model.encode(chunks).tolist()
    collection.add(
        ids=[f"{file_path}_{i}" for i in range(len(chunks))],
        embeddings=embeddings,
        metadatas=[{"text": chunk} for chunk in chunks],
    )
    return "βœ… Dokumen berhasil disimpan di ChromaDB!"
# Retrieve the stored chunks most similar to a query.
def search_relevant_text(query: str, top_k: int = 3) -> str:
    """Return the top_k most relevant stored chunks, joined by blank lines.

    Args:
        query: Natural-language question to embed and search with.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        The matching chunk texts separated by blank lines; "" when the
        collection is empty or has no matches.
    """
    query_vector = embedding_model.encode(query).tolist()
    results = collection.query(query_embeddings=[query_vector], n_results=top_k)
    # query() nests results per query embedding; guard against an empty
    # collection (missing/None metadatas) and skip None metadata entries.
    metadatas = results.get("metadatas") or [[]]
    retrieved_texts = [match["text"] for match in metadatas[0] if match]
    return "\n\n".join(retrieved_texts)
# Single endpoint: upload a document and ask a question about it in one call.
@app.post("/process")
async def process_document_and_ask_question(
    file: UploadFile = File(...),
    question: str = Form(...)
):
    """Upload a PDF/DOCX, index it into ChromaDB, and answer a question.

    Args:
        file: Uploaded document (only .pdf / .docx accepted).
        question: Question to answer from the document's content.

    Returns:
        JSON with the indexing status, the question, and the model's answer.

    Raises:
        HTTPException: 400 when the file extension is unsupported.
    """
    file_ext = file.filename.split(".")[-1].lower()
    if file_ext not in ["pdf", "docx"]:
        raise HTTPException(status_code=400, detail="❌ Hanya file PDF atau DOCX yang didukung.")
    # Spool the upload to a temporary path so the parsers can open it from disk.
    file_path = f"./temp_{file.filename}"
    with open(file_path, "wb") as f:
        f.write(await file.read())
    try:
        doc_status = store_document(file_path, file_ext)
    finally:
        # Always remove the temp file, even if extraction/indexing raises.
        os.remove(file_path)
    # Retrieve relevant context and ask the Groq model.
    context = search_relevant_text(question, top_k=3)
    prompt = f"Berikut adalah informasi dari dokumen:\n\n{context}\n\nPertanyaan: {question}\nJawaban:"
    response = chat_groq.invoke(prompt)
    # ChatGroq returns an AIMessage; extract .content, fall back to str().
    response_text = response.content if hasattr(response, "content") else str(response)
    return JSONResponse(content={
        "status": doc_status,
        "question": question,
        "answer": response_text,
    })