"""ITBOT app.py — RAG chatbot over course (mata kuliah) materials, served as a Gradio app."""
import os
import torch
import faiss
import numpy as np
import gradio as gr
import re # Import regex for advanced text cleaning
from transformers import AutoTokenizer, AutoModel, pipeline
from sklearn.preprocessing import normalize
# === 1. Load IndoBERT for embedding ===
# Single source of truth for the embedding model identifier.
EMBED_MODEL_NAME = "indobenchmark/indobert-base-p1"
embed_tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_NAME)
embed_model = AutoModel.from_pretrained(EMBED_MODEL_NAME)
def get_embedding(text):
    """Return a mean-pooled IndoBERT embedding of *text* as a 1-D numpy array."""
    encoded = embed_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only — no gradients needed.
    with torch.no_grad():
        hidden = embed_model(**encoded).last_hidden_state
    # Mean-pool over the token dimension, then drop the batch axis.
    return hidden.mean(dim=1).squeeze().numpy()
# === 2. Load GPT-2 Indonesian chat-tuned LLM ===
LLM_MODEL_NAME = "IzzulGod/GPT2-Indo-chat-tuned"
llm = pipeline("text-generation", model=LLM_MODEL_NAME)

# === 3. Load Documents & Build FAISS Index ===
DATA_DIR = "data"
doc_chunks = {}   # subject (mata kuliah) -> list of text chunks
doc_indexes = {}  # subject (mata kuliah) -> FAISS index over those chunks
# Clean raw text of irrelevant patterns BEFORE chunking and embedding.
def clean_document_text(text: str) -> str:
    """
    Clean document text for accurate retrieval.

    Removes URLs and common boilerplate lines (source credits, tags,
    comment footers, prev/next navigation, standalone page numbers),
    then collapses every whitespace run into a single space.

    Args:
        text: Raw document text.

    Returns:
        The cleaned, single-spaced, stripped text.
    """
    # Remove URLs.
    text = re.sub(r'http\S+|www\S+', '', text, flags=re.MULTILINE)
    # Remove common irrelevant lines (source, tags, comment counts, navigation).
    text = re.sub(r'Sumber:.*', '', text)
    text = re.sub(r'Tags:.*', '', text)
    text = re.sub(r'^\d+\s*pemikiran pada “.*”', '', text, flags=re.MULTILINE)
    text = re.sub(r'←.*→', '', text)
    # Remove lines that are just numbers (page numbers).
    text = re.sub(r'^\d+$', '', text, flags=re.MULTILINE)
    # Collapse every whitespace run (spaces, tabs, AND newlines) to one space.
    # NOTE: because this consumes newlines, a separate '\n+' normalization
    # pass afterwards was dead code and has been removed.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Process each text file in the data directory: clean, chunk, embed, index.
for fname in os.listdir(DATA_DIR):
    if not fname.endswith(".txt"):
        continue
    # Subject (mata kuliah) name is the upper-cased filename stem.
    matkul = os.path.splitext(fname)[0].upper()
    with open(os.path.join(DATA_DIR, fname), encoding='utf-8') as f:
        raw_text = f.read()

    # Clean BEFORE chunking/embedding so boilerplate never reaches the index.
    cleaned_text = clean_document_text(raw_text)

    # Fixed-size character chunks; ~300 chars keeps retrieved context concise
    # and reduces the chance a key sentence is split across chunks.
    chunks = [cleaned_text[i:i + 300] for i in range(0, len(cleaned_text), 300)]
    if not chunks:
        # Empty or boilerplate-only file: a 0-row embedding matrix would
        # crash on `embeddings.shape[1]` below, so skip the file entirely.
        continue
    doc_chunks[matkul] = chunks

    # Embed and L2-normalize every chunk.
    embeddings = np.array([get_embedding(chunk) for chunk in chunks])
    embeddings = normalize(embeddings)
    # FAISS requires contiguous float32 input; sklearn's normalize may
    # have promoted the dtype, so coerce explicitly.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    index = faiss.IndexFlatL2(embeddings.shape[1])  # exact L2 search
    index.add(embeddings)
    doc_indexes[matkul] = index
# === 4. RAG Function ===
def _strip_unwanted_starters(answer: str, question: str) -> str:
    """Strip boilerplate prefixes (e.g. 'Jawaban:', the echoed question) from *answer*."""
    # General, content-agnostic prefixes the model tends to emit.
    starters = [
        "Jawaban:", "Tujuan:", "Proses adalah:", "Definisi:", "Penjelasan:", "Hal ini adalah:",
        question.lower().strip(),  # the question repeated back verbatim
        "adalah",
        "terdiri dari",
        "dapat diterjemahkan oleh",
        "bahasa mesin",
        "program",
        "pengertian",
        ":",
    ]
    # Longest first, so a longer prefix is removed before its substrings.
    starters.sort(key=len, reverse=True)
    for pattern in starters:
        if answer.lower().startswith(pattern.lower()):
            answer = answer[len(pattern):].strip()
            if not answer:
                break  # nothing left to clean
    return answer


def _dedupe_lines(answer: str) -> str:
    """Drop consecutive duplicate lines and short non-substantive filler lines."""
    noise_words = ["pengertian", "adalah", "tujuan", "proses", "terdiri", "bahasa", "mesin"]
    cleaned_lines = []
    prev_stripped = ""
    for line in answer.split('\n'):
        stripped = line.strip()
        # Keep a line only if non-empty and not a case-insensitive repeat
        # of the previous kept line; skip lone filler words.
        if stripped and stripped.lower() != prev_stripped.lower():
            if len(stripped.split()) <= 2 and stripped.lower() in noise_words:
                continue
            cleaned_lines.append(line)
            prev_stripped = stripped
    return "\n".join(cleaned_lines).strip()


def rag_chat(matkul: str, question: str) -> str:
    """
    Retrieve relevant context for *matkul* and generate a concise answer.

    Args:
        matkul: The selected subject (mata kuliah), a key of `doc_indexes`.
        question: The user's question.

    Returns:
        A short, cleaned answer string, or a fallback message when the
        subject is unknown or no usable answer was produced.
    """
    if matkul not in doc_indexes:
        return "Mata kuliah tidak ditemukan."

    # Embed and normalize the query the same way the chunks were embedded.
    query_embed = get_embedding(question)
    query_embed = normalize(query_embed.reshape(1, -1))

    # BUGFIX: never ask FAISS for more neighbors than the index holds —
    # with k > ntotal it pads the result with -1, and indexing with -1
    # silently wrapped to the LAST chunk, duplicating wrong context.
    index = doc_indexes[matkul]
    k = min(5, index.ntotal)
    D, I = index.search(query_embed, k=k)
    context = "\n".join(doc_chunks[matkul][i] for i in I[0] if i >= 0)

    # Prompt tuned for extreme conciseness: answer only, no preamble/closing.
    prompt = f"""Sebagai asisten AI, berikan jawaban **paling singkat dan langsung** untuk pertanyaan berikut.
Gunakan **hanya informasi dari bagian "Informasi Relevan"** di bawah ini.
Jangan mengulang pertanyaan, menambahkan kalimat pengantar/penutup, atau informasi lain.
Fokus pada inti definisi atau penjelasan yang diminta. Jika informasi tidak cukup, jawab "Informasi tidak ditemukan."
Informasi Relevan dari mata kuliah {matkul}:
{context}
Pertanyaan: {question}
Jawaban:"""

    # Low-temperature sampling and a tight token budget keep answers short.
    output = llm(
        prompt,
        max_new_tokens=60,
        do_sample=True,
        temperature=0.3,
        top_k=20,
        top_p=0.8,
        pad_token_id=llm.tokenizer.eos_token_id,
        num_return_sequences=1,  # only one candidate answer
    )[0]["generated_text"]

    # Keep only the continuation past the prompt, then clean aggressively.
    generated_answer = output[len(prompt):].strip()
    generated_answer = _strip_unwanted_starters(generated_answer, question)
    generated_answer = _dedupe_lines(generated_answer)

    # Final whitespace pass: drop blank lines, collapse runs of whitespace.
    generated_answer = os.linesep.join(s for s in generated_answer.splitlines() if s.strip())
    generated_answer = re.sub(r'\s+', ' ', generated_answer).strip()

    # Keep only the first sentence for extreme conciseness.
    if '.' in generated_answer:
        final_answer = generated_answer.split('.')[0].strip() + '.'
    else:
        final_answer = generated_answer.strip()

    # Reject empty, too-short, or explicit "not found" answers.
    if (not final_answer
            or final_answer.lower().strip() == "informasi tidak ditemukan."
            or len(final_answer.split()) < 3):
        return "Informasi tidak ditemukan berdasarkan konteks yang relevan."
    return final_answer
# === 5. Gradio Interface ===
# Build the input widgets up front so the Interface call stays readable.
subject_dropdown = gr.Dropdown(choices=list(doc_chunks.keys()), label="Pilih Mata Kuliah")
question_box = gr.Textbox(label="Pertanyaan Anda")

interface = gr.Interface(
    fn=rag_chat,
    inputs=[subject_dropdown, question_box],
    outputs=gr.Textbox(label="Jawaban"),
    title="Chatbot RAG & LLM Mata Kuliah",
    description="Tanyakan sesuatu berdasarkan materi tiap mata kuliah.",
)

if __name__ == "__main__":
    interface.launch()