"""ITBOT app.py — RAG chatbot over course (mata kuliah) materials, served as a Gradio app."""
import os
import torch
import faiss
import numpy as np
import gradio as gr
import re # Import regex for advanced text cleaning
from transformers import AutoTokenizer, AutoModel, pipeline
from sklearn.preprocessing import normalize
# === 1. Load IndoBERT for embedding ===
# Single source of truth for the embedding model identifier.
EMBED_MODEL_NAME = "indobenchmark/indobert-base-p1"
embed_tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_NAME)
embed_model = AutoModel.from_pretrained(EMBED_MODEL_NAME)
def get_embedding(text):
    """Return a mean-pooled IndoBERT embedding of *text* as a 1-D numpy array."""
    encoded = embed_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only — no gradients needed.
    with torch.no_grad():
        hidden = embed_model(**encoded).last_hidden_state
    # Mean-pool over the token dimension, then drop the batch axis.
    return hidden.mean(dim=1).squeeze().numpy()
# === 2. Load GPT-2 Indonesian chat-tuned LLM ===
LLM_MODEL_NAME = "IzzulGod/GPT2-Indo-chat-tuned"
llm = pipeline("text-generation", model=LLM_MODEL_NAME)

# === 3. Load Documents & Build FAISS Index ===
DATA_DIR = "data"
doc_chunks = {}   # subject (mata kuliah) -> list of text chunks
doc_indexes = {}  # subject (mata kuliah) -> FAISS index over those chunks
# Clean raw text of irrelevant patterns BEFORE chunking and embedding.
def clean_document_text(text: str) -> str:
    """
    Clean document text for accurate retrieval.

    Removes URLs and common boilerplate lines (source credits, tags,
    comment footers, prev/next navigation, standalone page numbers),
    then collapses every whitespace run into a single space.

    Args:
        text: Raw document text.

    Returns:
        The cleaned, single-spaced, stripped text.
    """
    # Remove URLs.
    text = re.sub(r'http\S+|www\S+', '', text, flags=re.MULTILINE)
    # Remove common irrelevant lines (source, tags, comment counts, navigation).
    text = re.sub(r'Sumber:.*', '', text)
    text = re.sub(r'Tags:.*', '', text)
    text = re.sub(r'^\d+\s*pemikiran pada “.*”', '', text, flags=re.MULTILINE)
    text = re.sub(r'←.*→', '', text)
    # Remove lines that are just numbers (page numbers).
    text = re.sub(r'^\d+$', '', text, flags=re.MULTILINE)
    # Collapse every whitespace run (spaces, tabs, AND newlines) to one space.
    # NOTE: because this consumes newlines, a separate '\n+' normalization
    # pass afterwards was dead code and has been removed.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Process each text file in the data directory: clean, chunk, embed, index.
for fname in os.listdir(DATA_DIR):
    if not fname.endswith(".txt"):
        continue
    # Subject (mata kuliah) name is the upper-cased filename stem.
    matkul = os.path.splitext(fname)[0].upper()
    with open(os.path.join(DATA_DIR, fname), encoding='utf-8') as f:
        raw_text = f.read()

    # Clean BEFORE chunking/embedding so boilerplate never reaches the index.
    cleaned_text = clean_document_text(raw_text)

    # Fixed-size character chunks; ~300 chars keeps retrieved context concise
    # and reduces the chance a key sentence is split across chunks.
    chunks = [cleaned_text[i:i + 300] for i in range(0, len(cleaned_text), 300)]
    if not chunks:
        # Empty or boilerplate-only file: a 0-row embedding matrix would
        # crash on `embeddings.shape[1]` below, so skip the file entirely.
        continue
    doc_chunks[matkul] = chunks

    # Embed and L2-normalize every chunk.
    embeddings = np.array([get_embedding(chunk) for chunk in chunks])
    embeddings = normalize(embeddings)
    # FAISS requires contiguous float32 input; sklearn's normalize may
    # have promoted the dtype, so coerce explicitly.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    index = faiss.IndexFlatL2(embeddings.shape[1])  # exact L2 search
    index.add(embeddings)
    doc_indexes[matkul] = index
# === 4. RAG Function ===
def _strip_unwanted_starters(answer: str, question: str) -> str:
    """Strip boilerplate prefixes (e.g. 'Jawaban:', the echoed question) from *answer*."""
    # General, content-agnostic prefixes the model tends to emit.
    starters = [
        "Jawaban:", "Tujuan:", "Proses adalah:", "Definisi:", "Penjelasan:", "Hal ini adalah:",
        question.lower().strip(),  # the question repeated back verbatim
        "adalah",
        "terdiri dari",
        "dapat diterjemahkan oleh",
        "bahasa mesin",
        "program",
        "pengertian",
        ":",
    ]
    # Longest first, so a longer prefix is removed before its substrings.
    starters.sort(key=len, reverse=True)
    for pattern in starters:
        if answer.lower().startswith(pattern.lower()):
            answer = answer[len(pattern):].strip()
            if not answer:
                break  # nothing left to clean
    return answer


def _dedupe_lines(answer: str) -> str:
    """Drop consecutive duplicate lines and short non-substantive filler lines."""
    noise_words = ["pengertian", "adalah", "tujuan", "proses", "terdiri", "bahasa", "mesin"]
    cleaned_lines = []
    prev_stripped = ""
    for line in answer.split('\n'):
        stripped = line.strip()
        # Keep a line only if non-empty and not a case-insensitive repeat
        # of the previous kept line; skip lone filler words.
        if stripped and stripped.lower() != prev_stripped.lower():
            if len(stripped.split()) <= 2 and stripped.lower() in noise_words:
                continue
            cleaned_lines.append(line)
            prev_stripped = stripped
    return "\n".join(cleaned_lines).strip()


def rag_chat(matkul: str, question: str) -> str:
    """
    Retrieve relevant context for *matkul* and generate a concise answer.

    Args:
        matkul: The selected subject (mata kuliah), a key of `doc_indexes`.
        question: The user's question.

    Returns:
        A short, cleaned answer string, or a fallback message when the
        subject is unknown or no usable answer was produced.
    """
    if matkul not in doc_indexes:
        return "Mata kuliah tidak ditemukan."

    # Embed and normalize the query the same way the chunks were embedded.
    query_embed = get_embedding(question)
    query_embed = normalize(query_embed.reshape(1, -1))

    # BUGFIX: never ask FAISS for more neighbors than the index holds —
    # with k > ntotal it pads the result with -1, and indexing with -1
    # silently wrapped to the LAST chunk, duplicating wrong context.
    index = doc_indexes[matkul]
    k = min(5, index.ntotal)
    D, I = index.search(query_embed, k=k)
    context = "\n".join(doc_chunks[matkul][i] for i in I[0] if i >= 0)

    # Prompt tuned for extreme conciseness: answer only, no preamble/closing.
    prompt = f"""Sebagai asisten AI, berikan jawaban **paling singkat dan langsung** untuk pertanyaan berikut.
Gunakan **hanya informasi dari bagian "Informasi Relevan"** di bawah ini.
Jangan mengulang pertanyaan, menambahkan kalimat pengantar/penutup, atau informasi lain.
Fokus pada inti definisi atau penjelasan yang diminta. Jika informasi tidak cukup, jawab "Informasi tidak ditemukan."
Informasi Relevan dari mata kuliah {matkul}:
{context}
Pertanyaan: {question}
Jawaban:"""

    # Low-temperature sampling and a tight token budget keep answers short.
    output = llm(
        prompt,
        max_new_tokens=60,
        do_sample=True,
        temperature=0.3,
        top_k=20,
        top_p=0.8,
        pad_token_id=llm.tokenizer.eos_token_id,
        num_return_sequences=1,  # only one candidate answer
    )[0]["generated_text"]

    # Keep only the continuation past the prompt, then clean aggressively.
    generated_answer = output[len(prompt):].strip()
    generated_answer = _strip_unwanted_starters(generated_answer, question)
    generated_answer = _dedupe_lines(generated_answer)

    # Final whitespace pass: drop blank lines, collapse runs of whitespace.
    generated_answer = os.linesep.join(s for s in generated_answer.splitlines() if s.strip())
    generated_answer = re.sub(r'\s+', ' ', generated_answer).strip()

    # Keep only the first sentence for extreme conciseness.
    if '.' in generated_answer:
        final_answer = generated_answer.split('.')[0].strip() + '.'
    else:
        final_answer = generated_answer.strip()

    # Reject empty, too-short, or explicit "not found" answers.
    if (not final_answer
            or final_answer.lower().strip() == "informasi tidak ditemukan."
            or len(final_answer.split()) < 3):
        return "Informasi tidak ditemukan berdasarkan konteks yang relevan."
    return final_answer
# === 5. Gradio Interface ===
# Build the input widgets up front so the Interface call stays readable.
subject_dropdown = gr.Dropdown(choices=list(doc_chunks.keys()), label="Pilih Mata Kuliah")
question_box = gr.Textbox(label="Pertanyaan Anda")

interface = gr.Interface(
    fn=rag_chat,
    inputs=[subject_dropdown, question_box],
    outputs=gr.Textbox(label="Jawaban"),
    title="Chatbot RAG & LLM Mata Kuliah",
    description="Tanyakan sesuatu berdasarkan materi tiap mata kuliah.",
)

if __name__ == "__main__":
    interface.launch()