Spaces:
Build error
Build error
| import os | |
| import pdfplumber | |
| import docx | |
| import chromadb | |
| from fastapi import FastAPI, UploadFile, File, Form, HTTPException | |
| from fastapi.responses import JSONResponse | |
| from dotenv import load_dotenv | |
| from sentence_transformers import SentenceTransformer | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_groq import ChatGroq | |
| from typing import List | |
# --- Configuration -----------------------------------------------------------
# Read variables from a local .env file (if present) into the process
# environment, then require the Groq API key before anything else is set up.
load_dotenv()
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise Exception(
        "β GROQ API Key tidak ditemukan! Pastikan sudah menyimpan API Key di file .env."
    )

# --- Web application ---------------------------------------------------------
app = FastAPI(title="Document Chat API (FastAPI + ChromaDB + Groq)")

# --- Vector store ------------------------------------------------------------
# Local, on-disk ChromaDB database; the collection is created on first use.
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.get_or_create_collection(name="document_chunks")

# --- Models ------------------------------------------------------------------
# Sentence embedding model used for both stored chunks and incoming queries.
embedding_model = SentenceTransformer("sangmini/msmarco-cotmae-MiniLM-L12_en-ko-ja")

# Groq-hosted chat model that answers questions about the retrieved context.
chat_groq = ChatGroq(api_key=GROQ_API_KEY, model_name="qwen-2.5-coder-32b")
def extract_text_from_file(file_path: str, file_type: str) -> str:
    """Extract the plain text of a PDF or DOCX file.

    Args:
        file_path: Path of the document on disk.
        file_type: File extension without the dot, ``"pdf"`` or ``"docx"``
            (matched case-insensitively).

    Returns:
        The extracted text, with PDF pages / DOCX paragraphs separated by a
        newline. An empty string is returned for any other ``file_type`` or
        when nothing could be extracted.
    """
    kind = file_type.lower()

    if kind == "pdf":
        with pdfplumber.open(file_path) as pdf:
            # `or ""` covers pages for which no text is returned.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        # Join with a newline so the last word of one page is not fused with
        # the first word of the next; empty pages are skipped so a document
        # without any text still yields "".
        return "\n".join(page_text for page_text in page_texts if page_text)

    if kind == "docx":
        document = docx.Document(file_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)

    return ""
def store_document(file_path: str, file_type: str):
    """Extract a document's text, chunk it, embed it and store it in ChromaDB.

    Args:
        file_path: Path of the document on disk. It is also used as the
            prefix of every chunk ID (``"<file_path>_<index>"``).
        file_type: File extension without the dot (``"pdf"`` or ``"docx"``).

    Returns:
        A human-readable status string: a failure message when no text (or no
        chunk) could be produced, otherwise a success message.
    """
    text = extract_text_from_file(file_path, file_type)
    if not text or not text.strip():
        return "β Gagal mengekstrak teks dari file."

    # Split the text into small, slightly overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_text(text)
    if not chunks:
        # Nothing to index: report failure instead of claiming success.
        return "β Gagal mengekstrak teks dari file."

    # Embed and write in batches rather than one call per chunk; slicing keeps
    # each request (and the embedding matrix held in memory) bounded.
    batch_size = 256
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        vectors = embedding_model.encode(batch).tolist()
        # upsert (not add) so that re-uploading a file with the same name
        # replaces the chunks stored under the same IDs.
        # NOTE(review): if the new version has fewer chunks, higher-index
        # chunks from the previous upload are not removed here.
        collection.upsert(
            ids=[f"{file_path}_{i}" for i in range(start, start + len(batch))],
            embeddings=vectors,
            metadatas=[{"text": chunk} for chunk in batch],
        )

    return "β Dokumen berhasil disimpan di ChromaDB!"
def search_relevant_text(query: str, top_k: int = 3) -> str:
    """Return the stored chunk texts that best match ``query``.

    Args:
        query: The question or search phrase to embed and look up.
        top_k: Number of results requested from the collection.

    Returns:
        The ``"text"`` metadata of the matches, joined by blank lines.
    """
    query_embedding = embedding_model.encode(query).tolist()
    hits = collection.query(query_embeddings=[query_embedding], n_results=top_k)

    # A single query embedding was sent, so read the metadata list at index 0
    # and pull the chunk text out of each entry.
    passages = []
    for metadata in hits["metadatas"][0]:
        passages.append(metadata["text"])
    return "\n\n".join(passages)
# Main endpoint: upload a file and ask a question about it in a single request.
# NOTE(review): no route decorator (e.g. `@app.post(...)`) is visible above
# this handler in this view; confirm it is registered with `app`, otherwise
# the endpoint is unreachable.
async def process_document_and_ask_question(
    file: UploadFile = File(...),
    question: str = Form(...),
):
    """Index an uploaded PDF/DOCX and answer a question using its content.

    Args:
        file: The uploaded document; only ``.pdf`` and ``.docx`` are accepted.
        question: The question to answer.

    Returns:
        A ``JSONResponse`` with the indexing ``status``, the ``question`` and
        the model's ``answer``.

    Raises:
        HTTPException: 400 when the upload has no usable filename or its
            extension is neither ``pdf`` nor ``docx``.
    """
    # The filename is client-controlled: keep only its last path component so
    # it cannot point outside the working directory, and tolerate a missing
    # name (which then fails the extension check below).
    raw_name = (file.filename or "").replace("\\", "/")
    safe_name = os.path.basename(raw_name)

    file_ext = safe_name.split(".")[-1].lower()
    if file_ext not in ("pdf", "docx"):
        raise HTTPException(status_code=400, detail="β Hanya file PDF atau DOCX yang didukung.")

    # The path doubles as the chunk-ID prefix in the vector store, so it is
    # derived from the (sanitised) filename rather than randomised.
    file_path = f"./temp_{safe_name}"
    try:
        # Save the upload to a temporary file so the extractors can open it.
        with open(file_path, "wb") as f:
            f.write(await file.read())

        # Extract, chunk, embed and store the document.
        doc_status = store_document(file_path, file_ext)
    finally:
        # Always remove the temporary file, even when extraction or indexing
        # raises.
        if os.path.exists(file_path):
            os.remove(file_path)

    # Retrieve the chunks most relevant to the question.
    context = search_relevant_text(question, top_k=3)

    # Ask the Groq chat model, grounding it in the retrieved context.
    prompt = f"Berikut adalah informasi dari dokumen:\n\n{context}\n\nPertanyaan: {question}\nJawaban:"
    response = chat_groq.invoke(prompt)

    # The model returns a message object; use its `.content` when available.
    response_text = response.content if hasattr(response, "content") else str(response)

    return JSONResponse(content={
        "status": doc_status,
        "question": question,
        "answer": response_text,
    })