johnnydang88 commited on
Commit
d87893b
·
verified ·
1 Parent(s): b5830e8

Upload 4 files

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. README.md +13 -0
  3. app.py +907 -0
  4. laborcode.pdf +3 -0
  5. requirements.txt +17 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ laborcode.pdf filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Philippine Labor Code RAG Assistant
3
+ emoji: "👷"
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: streamlit
7
+ sdk_version: "1.40.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # Philippine Labor Code & Employee Rights Assistant
13
+ Multi-Model RAG Evaluation: Qwen2.5-7B, LLaMA-3.1-8B, Gemma-2-9B
app.py ADDED
@@ -0,0 +1,907 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Philippine Labor Code & Employee Rights Assistant
3
+ Multi-Model RAG Evaluation: Qwen2.5-7B | LLaMA-3.1-8B | Gemma-2-9B
4
+
5
+ Streamlit deployment converts the Colab notebook pipeline into a
6
+ permanent web application. All three models share a single retrieval
7
+ pipeline (BGE-M3 dense + BM25 sparse + RRF fusion + cross-encoder
8
+ reranking + MMR diversity selection).
9
+ """
10
+
11
+ import os
12
+ import re
13
+ import gc
14
+ import time
15
+ import json
16
+ import nltk
17
+ import torch
18
+ import faiss
19
+ import numpy as np
20
+ import streamlit as st
21
+ import matplotlib
22
+ matplotlib.use("Agg")
23
+ import matplotlib.pyplot as plt
24
+
25
+ from collections import Counter
26
+ from pypdf import PdfReader
27
+ from rank_bm25 import BM25Okapi
28
+ from sentence_transformers import SentenceTransformer, CrossEncoder
29
+ from sklearn.metrics.pairwise import cosine_similarity as cos_sim
30
+ from rouge_score import rouge_scorer
31
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
32
+
33
# ---------------------------------------------------------------------------
# Download NLTK data at startup
# ---------------------------------------------------------------------------
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

# ---------------------------------------------------------------------------
# Page configuration
# ---------------------------------------------------------------------------
st.set_page_config(
    page_title="Philippine Labor Code RAG Assistant",
    page_icon=None,
    layout="wide",
    initial_sidebar_state="expanded",
)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Source document plus the retrieval-stack model identifiers.
PDF_PATH = "laborcode.pdf"
EMBEDDING_MODEL_NAME = "BAAI/bge-m3"
RERANKER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L12-v2"

# LLMs compared side-by-side. `supports_system` records whether the model's
# chat template accepts a dedicated "system" role (Gemma-2 does not).
MODEL_CONFIGS = {
    "Qwen2.5-7B-Instruct": {
        "hf_id": "Qwen/Qwen2.5-7B-Instruct",
        "supports_system": True,
    },
    "LLaMA-3.1-8B-Instruct": {
        "hf_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "supports_system": True,
    },
    "Gemma-2-9B-IT": {
        "hf_id": "google/gemma-2-9b-it",
        "supports_system": False,
    },
}

# Shared system prompt grounding every model in the retrieved context only.
SYSTEM_PROMPT = (
    "You are Lex, a helpful and professional Philippine Labor Law assistant. "
    "Answer the user's question accurately using ONLY the provided legal context "
    "from the Philippine Labor Code (Presidential Decree No. 442, as amended). "
    "Always cite the specific Article number(s) when applicable. "
    "If the context does not contain enough information to answer, say so honestly. "
    "Do not fabricate legal provisions."
)
79
+
80
# ---------------------------------------------------------------------------
# Greeting detection (from notebook Cell 7)
# ---------------------------------------------------------------------------
# Small-talk inputs are answered with a canned reply instead of running the
# full retrieval + generation pipeline.
GREETING_PATTERNS = [
    r"^(hi|hello|hey|good morning|good afternoon|good evening|kamusta|kumusta|musta|helo|oi|uy)[\s!?.]*$",
    r"^(what can you do|what are you|who are you|what is lex|are you a bot|are you ai)[\s?]*$",
    r"^(thanks?|thank you|salamat|maraming salamat|ty)[\s!.]*$",
    r"^(bye|goodbye|see you|paalam|ok|okay|sure|alright|got it|noted)[\s!.]*$",
    r"^(help|tulong|tulungan mo ako)[\s!?]*$",
]

GREETING_RESPONSE = (
    "Hello! I am Lex, your Philippine Labor Law assistant. "
    "Feel free to ask me any questions about labor rights, employment policies, "
    "wages, working hours, leaves, termination, or any workplace concerns under "
    "the Philippine Labor Code (PD 442). How can I help you today?"
)


def is_greeting(text: str) -> bool:
    """Return True when *text* looks like small talk rather than a legal question.

    A message is a greeting when it matches one of GREETING_PATTERNS, or when
    it is very short (<= 3 words) and contains no legal vocabulary.
    """
    normalized = text.strip().lower()
    if any(re.match(pat, normalized, re.IGNORECASE) for pat in GREETING_PATTERNS):
        return True
    legal_keywords = [
        "article", "labor", "wage", "leave", "work", "employ",
        "salary", "pay", "overtime", "holiday", "terminate",
        "strike", "union", "dole", "law", "code", "right",
        "benefit", "retire", "resign", "dismiss",
    ]
    # Very short inputs with no legal keyword are treated as chit-chat too.
    words = normalized.split()
    return len(words) <= 3 and not any(kw in normalized for kw in legal_keywords)
114
+
115
+
116
+ # ---------------------------------------------------------------------------
117
+ # PDF processing and chunking (from notebook Cells 4-5)
118
+ # ---------------------------------------------------------------------------
119
+ def extract_text_from_pdf(pdf_path: str) -> str:
120
+ reader = PdfReader(pdf_path)
121
+ text = ""
122
+ for page in reader.pages:
123
+ page_text = page.extract_text()
124
+ if page_text:
125
+ text += page_text + "\n"
126
+ return text
127
+
128
+
129
def clean_text(text: str) -> str:
    """Normalize raw PDF text before chunking.

    Mends article numbers split by extraction, removes page markers, footnote
    lines, bare page-number lines, and collapses excess whitespace.
    """
    # Re-join article numbers that PDF extraction split across a space,
    # e.g. "ART. 8 2." -> "ART. 82." and "Article 8 2" -> "Article 82".
    text = re.sub(r"(ART\.)\s*(\d+)\s+(\d+)\.", r"\1 \2\3.", text)
    text = re.sub(r"(Article\s+)(\d+)\s+(\d+)", r"\1\2\3", text)
    # Strip "--- Page N ---" markers.
    text = re.sub(r"---\s*Page\s*\d+\s*---", "", text)
    # Drop footnote lines: a small leading number followed by citation phrasing.
    text = re.sub(
        r"\n\s*\d{1,3}\s+(?:See|As amended|R\.A\.|P\.D\.|E\.O\.|The |This |Pursuant|Section|Sec\.).*",
        "",
        text,
        flags=re.IGNORECASE,
    )
    # Remove explicit footnote blocks and bare page-number lines.
    text = re.sub(r"\[Footnote\].*?\n", "\n", text, flags=re.DOTALL)
    text = re.sub(r"\n\s*\d{1,4}\s*\n", "\n", text)
    # Collapse long runs of spaces/tabs and excessive blank lines.
    text = re.sub(r"[ \t]{3,}", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
144
+
145
+
146
def is_substantive_chunk(chunk: str, max_footnote_ratio: float = 0.08) -> bool:
    """Return True unless *chunk* is dominated by footnote/citation markers.

    The chunk is rejected when case-insensitive marker occurrences per word
    reach *max_footnote_ratio* (default 8%). Empty chunks are rejected.
    """
    footnote_markers = [
        "[Footnote]", "See DOLE", "As amended by", "superseded by",
        "cross-reference", "R.A. No.", "P.D. No.", "E.O. No.",
        "pursuant to", "inserted in", "renumbered as",
    ]
    word_count = len(chunk.split())
    if not word_count:
        return False
    lowered = chunk.lower()
    marker_hits = sum(lowered.count(marker.lower()) for marker in footnote_markers)
    return marker_hits / word_count < max_footnote_ratio
158
+
159
+
160
def fix_broken_article_header(chunk: str) -> str:
    """Re-join an article header whose number was split by PDF extraction,
    e.g. "ART. 8 2." -> "ART. 82". Non-matching text is returned unchanged."""
    def _merge(match: re.Match) -> str:
        return match.group(1) + match.group(2) + match.group(3)

    return re.sub(
        r"(ART\.?\s*)(\d)\s+(\d+\.)",
        _merge,
        chunk,
        flags=re.IGNORECASE,
    )
167
+
168
+
169
def chunk_text_by_article(
    text: str, max_len: int = 1200, overlap: int = 200, min_len: int = 100
) -> list[str]:
    """Split *text* on "ART. N" boundaries and filter out boilerplate.

    Articles longer than *max_len* characters are sub-split into sentence
    windows; each continuation window re-attaches the article header plus the
    last *overlap* characters of the previous window so it stays attributable.
    Chunks shorter than *min_len*, boilerplate front-matter, and
    footnote-dominated chunks are dropped.
    """
    article_pattern = re.compile(
        r"(?=(?:ART\.|Art\.|ARTICLE)\s+\d+[\.\ ])", re.IGNORECASE
    )
    raw_splits = article_pattern.split(text)
    chunks = []
    for block in raw_splits:
        block = block.strip()
        if not block:
            continue
        if len(block) <= max_len:
            if len(block) >= min_len:
                chunks.append(block)
        else:
            # Article too long: break into overlapping sentence windows.
            header_match = re.match(
                r"((?:ART\.|Art\.|ARTICLE)\s+\d+[^.]*\.)", block
            )
            header = header_match.group(1).strip() if header_match else ""
            sentences = re.split(r"(?<=[.!?;])\s+", block)
            current = ""
            chunk_num = 0
            for sent in sentences:
                if len(current) + len(sent) > max_len:
                    if current:
                        chunks.append(current.strip())
                        chunk_num += 1
                    # Seed the next window with the header (marked "[cont]")
                    # and the tail of the previous window for context.
                    tail = current[-overlap:] if len(current) > overlap else ""
                    current = (
                        (header + " [cont] " + tail)
                        if header and chunk_num > 0
                        else tail
                    )
                current += " " + sent
            if current.strip() and len(current.strip()) >= min_len:
                chunks.append(current.strip())

    # Drop front-matter/boilerplate and footnote-heavy chunks.
    boilerplate_patterns = [
        "NOT FOR SALE", "Copyright", "SILVESTRE H. BELLO",
        "Table of Contents", "FOREWORD", "www.dole.gov.ph",
        "Repealing Clause", "cross-references all superseded",
        "Name of Decree",
    ]
    chunks = [
        c
        for c in chunks
        if not any(bp.lower() in c.lower() for bp in boilerplate_patterns)
        and len(c.strip()) > min_len
        and is_substantive_chunk(c)
    ]
    return [fix_broken_article_header(c) for c in chunks]
222
+
223
+
224
# ---------------------------------------------------------------------------
# Retrieval functions (from notebook Cells 6-7)
# ---------------------------------------------------------------------------
def mmr_select(
    candidates: list[str],
    scores: list[float],
    embeddings: np.ndarray,
    k: int = 5,
    lam: float = 0.6,
):
    """Maximal Marginal Relevance selection of *k* candidates.

    Greedily picks candidates trading off relevance (*scores*) against
    redundancy (max dot product with already-selected embeddings; embeddings
    are expected normalized so this is cosine similarity). *lam* weights
    relevance, ``1 - lam`` weights diversity. Returns the selected candidates
    and their scores in selection order; if there are at most *k* candidates
    the inputs are returned untouched.
    """
    if len(candidates) <= k:
        return candidates, scores
    embs = np.array(embeddings)
    chosen: list[int] = []
    pool = list(range(len(candidates)))
    while pool and len(chosen) < k:
        if chosen:
            picked_embs = embs[chosen]

            # Bind the current selection via default arg so each closure is fixed.
            def marginal(i, _picked=picked_embs):
                redundancy = float(np.max(_picked @ embs[i]))
                return lam * scores[i] - (1.0 - lam) * redundancy

            nxt = max(pool, key=marginal)
        else:
            # First pick: pure relevance.
            nxt = max(pool, key=lambda i: scores[i])
        chosen.append(nxt)
        pool.remove(nxt)
    return [candidates[i] for i in chosen], [scores[i] for i in chosen]
254
+
255
+
256
def deduplicate_by_article(
    ranked_pairs: list, max_per_article: int = 2, final_k: int = 5
) -> list:
    """Cap each article at *max_per_article* chunks, preserving ranked order.

    *ranked_pairs* is a best-first list of (chunk, score). Chunks that do not
    begin with an "ART. N" header share a single "UNK" bucket. At most
    *final_k* pairs are returned.
    """
    per_article: dict = {}
    kept: list = []
    for chunk, score in ranked_pairs:
        header = re.match(r"(ART\.?\s*\d+)", chunk, re.IGNORECASE)
        key = header.group(1).upper().replace(" ", "") if header else "UNK"
        seen = per_article.get(key, 0)
        if seen < max_per_article:
            kept.append((chunk, score))
            per_article[key] = seen + 1
            if len(kept) == final_k:
                break
    return kept
271
+
272
+
273
def hybrid_retrieve_and_rerank(
    question: str,
    embedder: SentenceTransformer,
    index: faiss.IndexFlatIP,
    bm25: BM25Okapi,
    reranker: CrossEncoder,
    chunks: list[str],
    initial_k: int = 20,
    rerank_k: int = 8,
    final_k: int = 5,
):
    """Hybrid retrieval: dense + BM25 fused with RRF, then cross-encoder
    reranking, per-article deduplication, and MMR diversity selection.

    Returns (top_chunks, top_scores): up to *final_k* chunks and their
    cross-encoder scores, in MMR selection order.
    """
    # --- Dense retrieval (BGE-style "query:" prefix, normalized for IP) ---
    query_emb = embedder.encode(
        [f"query: {question}"], convert_to_numpy=True, normalize_embeddings=True
    )
    _dense_scores, dense_indices = index.search(query_emb, initial_k)
    # Fix: FAISS pads results with -1 when the index holds fewer than
    # initial_k vectors; filter those out so chunks[-1] is never selected.
    dense_ranking = [int(i) for i in dense_indices[0] if i >= 0]

    # --- Sparse retrieval (BM25 over lowercase whitespace tokens) ---
    bm25_scores = bm25.get_scores(question.lower().split())
    bm25_ranking = [int(i) for i in np.argsort(bm25_scores)[::-1][:initial_k]]

    # --- Reciprocal Rank Fusion of the two rankings ---
    rrf_k = 60  # standard RRF damping constant
    rrf_scores: dict = {}
    for ranking in (dense_ranking, bm25_ranking):
        for rank, idx in enumerate(ranking):
            rrf_scores[idx] = rrf_scores.get(idx, 0.0) + 1.0 / (rank + rrf_k)

    fused_indices = sorted(rrf_scores, key=rrf_scores.get, reverse=True)[:initial_k]
    candidate_chunks = [chunks[i] for i in fused_indices]

    # --- Cross-encoder reranking of fused candidates ---
    pairs = [[question, chunk] for chunk in candidate_chunks]
    rerank_scores_arr = reranker.predict(pairs)
    ranked_all = sorted(
        zip(candidate_chunks, rerank_scores_arr.tolist()),
        key=lambda x: x[1],
        reverse=True,
    )[:rerank_k]

    # --- Per-article deduplication (at most 2 chunks per article) ---
    deduped = deduplicate_by_article(ranked_all, max_per_article=2, final_k=rerank_k)
    dedup_chunks = [x[0] for x in deduped]
    dedup_scores = [x[1] for x in deduped]

    # --- MMR diversity selection over the deduplicated set ---
    cand_embs = embedder.encode(
        [f"passage: {c}" for c in dedup_chunks],
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    top_chunks, top_scores = mmr_select(
        dedup_chunks, dedup_scores, cand_embs, k=final_k, lam=0.6
    )
    return top_chunks, top_scores
331
+
332
+
333
# ---------------------------------------------------------------------------
# Model loading and generation
# ---------------------------------------------------------------------------
def load_model_and_tokenizer(model_name: str):
    """Load a model with 4-bit quantization. Returns (model, tokenizer).

    *model_name* must be a key of MODEL_CONFIGS. The model is loaded with
    NF4 double quantization, fp16 compute, device_map="auto", and put in
    eval mode.
    """
    hf_id = MODEL_CONFIGS[model_name]["hf_id"]

    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(hf_id, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Some chat models ship without a pad token; reuse EOS for padding.
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        hf_id,
        quantization_config=quant_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.float16,
    )
    model.eval()
    return model, tokenizer
361
+
362
+
363
def unload_model(model, tokenizer):
    """Drop references to a model/tokenizer pair and reclaim memory.

    Runs the garbage collector and, when CUDA is present, empties the CUDA
    allocator cache so the next model can be loaded on the same GPU.
    """
    del model, tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
370
+
371
+
372
def build_prompt(
    model_name: str,
    question: str,
    context_chunks: list[str],
    tokenizer,
) -> str:
    """Build a chat-formatted prompt appropriate for each model.

    Joins *context_chunks* into one context block, then renders the messages
    through the tokenizer's chat template. Models without a system role
    (per MODEL_CONFIGS) get the system prompt folded into the user turn.
    """
    context_block = "\n\n---\n\n".join(context_chunks)
    user_content = (
        f"CONTEXT (from the Philippine Labor Code):\n{context_block}\n\n"
        f"QUESTION: {question}\n\n"
        f"Provide a clear, accurate answer citing specific Article numbers."
    )

    if MODEL_CONFIGS[model_name]["supports_system"]:
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ]
    else:
        # Gemma-2 does not support system role; inject into first user turn
        messages = [
            {"role": "user", "content": f"{SYSTEM_PROMPT}\n\n{user_content}"}
        ]

    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
402
+
403
+
404
def generate_answer(
    model,
    tokenizer,
    prompt: str,
    max_new_tokens: int = 512,
) -> str:
    """Generate an answer from the model, returning only the new text.

    The prompt is truncated to 4096 tokens; generation samples with
    temperature 0.3 / top-p 0.9 and a mild repetition penalty.
    """
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)
    encoded = {k: v.to(model.device) for k, v in encoded.items()}

    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            temperature=0.3,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.15,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Slice off the prompt tokens so only the model's reply is decoded.
    prompt_len = encoded["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
429
+
430
+
431
# ---------------------------------------------------------------------------
# Evaluation metrics (from the notebook)
# ---------------------------------------------------------------------------
def compute_faithfulness(answer: str, context_chunks: list[str], embedder) -> float:
    """Semantic similarity between the answer and the retrieved context.

    Returns 0.0 when either input is empty; otherwise cosine similarity of
    normalized embeddings of the answer and the concatenated context.
    """
    if not answer or not context_chunks:
        return 0.0
    joined_context = " ".join(context_chunks)
    vecs = embedder.encode(
        [answer, joined_context], convert_to_numpy=True, normalize_embeddings=True
    )
    return float(cos_sim([vecs[0]], [vecs[1]])[0][0])
443
+
444
+
445
def compute_semantic_similarity(
    answer: str, ground_truth: str, embedder
) -> float:
    """Cosine similarity between answer and ground truth embeddings.

    Returns 0.0 when either string is empty.
    """
    if not answer or not ground_truth:
        return 0.0
    vecs = embedder.encode(
        [answer, ground_truth], convert_to_numpy=True, normalize_embeddings=True
    )
    return float(cos_sim([vecs[0]], [vecs[1]])[0][0])
455
+
456
+
457
def compute_answer_relevancy(answer: str, question: str, embedder) -> float:
    """Cosine similarity between answer and question embeddings.

    Returns 0.0 when either string is empty.
    """
    if not answer or not question:
        return 0.0
    vecs = embedder.encode(
        [answer, question], convert_to_numpy=True, normalize_embeddings=True
    )
    return float(cos_sim([vecs[0]], [vecs[1]])[0][0])
465
+
466
+
467
def compute_citation_accuracy(
    answer: str, expected_articles: list[str]
) -> float:
    """Fraction of expected article numbers cited in *answer*.

    Accepts the forms "Article N", "Art. N", and "ART. N" (case-insensitive).
    Vacuously 1.0 when no expected articles are supplied.
    """
    if not expected_articles:
        return 1.0
    cited = sum(
        1
        for art in expected_articles
        if re.search(
            rf"(?:Article|Art\.?|ART\.?)\s*{art}\b", answer, re.IGNORECASE
        )
    )
    return cited / len(expected_articles)
483
+
484
+
485
def compute_rouge_l(answer: str, ground_truth: str) -> float:
    """ROUGE-L F1 with *ground_truth* as reference and *answer* as hypothesis.

    Returns 0.0 when either string is empty. Uses stemming.
    """
    if not answer or not ground_truth:
        return 0.0
    rl = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    return rl.score(ground_truth, answer)["rougeL"].fmeasure
492
+
493
+
494
def compute_retrieval_recall(
    chunks: list[str], expected_articles: list[str]
) -> float:
    """Fraction of expected articles found anywhere in the retrieved chunks.

    Vacuously 1.0 when no expected articles are given. Matches "Article N",
    "Art. N", or "ART. N" (case-insensitive) across all chunks combined.
    """
    if not expected_articles:
        return 1.0
    combined = " ".join(chunks)
    hits = sum(
        1
        for art in expected_articles
        if re.search(
            rf"(?:Article|Art\.?|ART\.?)\s*{art}\b", combined, re.IGNORECASE
        )
    )
    return hits / len(expected_articles)
511
+
512
+
513
def compute_retrieval_precision(
    chunks: list[str], expected_articles: list[str]
) -> float:
    """Fraction of retrieved chunks that contain at least one expected article.

    Returns 0.0 when no chunks were retrieved. Fix: an empty
    *expected_articles* list previously made every chunk count as irrelevant
    (precision 0.0) even though the sibling compute_retrieval_recall returns
    a vacuous 1.0 in that case; treat "no expected articles" as vacuously
    precise (1.0) so the two metrics agree when no ground truth is supplied.
    """
    if not chunks:
        return 0.0
    if not expected_articles:
        # Vacuous case — mirrors compute_retrieval_recall.
        return 1.0
    relevant_count = 0
    for chunk in chunks:
        # A chunk is relevant if it cites any expected article in any of the
        # forms "Article N", "Art. N", or "ART. N" (case-insensitive).
        if any(
            re.search(
                rf"(?:Article|Art\.?|ART\.?)\s*{art}\b", chunk, re.IGNORECASE
            )
            for art in expected_articles
        ):
            relevant_count += 1
    return relevant_count / len(chunks)
531
+
532
+
533
def evaluate_single_response(
    question: str,
    answer: str,
    context_chunks: list[str],
    ground_truth: str,
    expected_articles: list[str],
    embedder,
) -> dict:
    """Compute every evaluation metric for one model answer.

    Returns a dict of metric name -> value rounded to 4 decimals. Generation
    metrics compare *answer* against context/ground truth; Recall@5 and
    Precision@5 score the retrieved *context_chunks* themselves.
    """
    scores = {
        "Faithfulness": compute_faithfulness(answer, context_chunks, embedder),
        "Semantic Similarity": compute_semantic_similarity(
            answer, ground_truth, embedder
        ),
        "Answer Relevancy": compute_answer_relevancy(answer, question, embedder),
        "Citation Accuracy": compute_citation_accuracy(answer, expected_articles),
        "ROUGE-L": compute_rouge_l(answer, ground_truth),
        "Recall@5": compute_retrieval_recall(context_chunks, expected_articles),
        "Precision@5": compute_retrieval_precision(
            context_chunks, expected_articles
        ),
    }
    return {name: round(value, 4) for name, value in scores.items()}
563
+
564
+
565
# ---------------------------------------------------------------------------
# Visualization
# ---------------------------------------------------------------------------
def render_comparison_chart(all_metrics: dict) -> plt.Figure:
    """Create a grouped bar chart comparing metrics across models.

    all_metrics: { "ModelName": { "MetricName": value, ... }, ... }.
    Missing metrics are plotted as 0.0. Returns the matplotlib Figure
    (caller is responsible for closing it).
    """
    metric_names = [
        "Faithfulness",
        "Semantic Similarity",
        "Answer Relevancy",
        "Citation Accuracy",
        "ROUGE-L",
        "Recall@5",
        "Precision@5",
    ]
    model_names = list(all_metrics.keys())
    n_models = len(model_names)

    positions = np.arange(len(metric_names))
    bar_width = 0.8 / max(n_models, 1)
    palette = ["#2563eb", "#dc2626", "#16a34a"]  # blue / red / green

    fig, ax = plt.subplots(figsize=(14, 6))
    for i, model in enumerate(model_names):
        values = [all_metrics[model].get(m, 0.0) for m in metric_names]
        # Center the group of bars around each metric tick.
        offset = (i - n_models / 2 + 0.5) * bar_width
        bars = ax.bar(
            positions + offset, values, bar_width,
            label=model, color=palette[i % 3],
        )
        # Annotate each bar with its value.
        for bar, val in zip(bars, values):
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height() + 0.01,
                f"{val:.2f}",
                ha="center",
                va="bottom",
                fontsize=7,
            )

    ax.set_ylabel("Score")
    ax.set_title("Multi-Model RAG Evaluation Comparison")
    ax.set_xticks(positions)
    ax.set_xticklabels(metric_names, rotation=30, ha="right")
    ax.set_ylim(0, 1.15)  # headroom for the bar labels
    ax.legend(loc="upper right")
    ax.grid(axis="y", alpha=0.3)
    fig.tight_layout()
    return fig
614
+
615
+
616
# ---------------------------------------------------------------------------
# Cached resource loaders
# ---------------------------------------------------------------------------
@st.cache_resource(show_spinner="Loading PDF and building document chunks...")
def load_chunks():
    """Extract, clean, and chunk the Labor Code PDF (cached per session).

    Halts the Streamlit app with an error if the PDF is missing.
    """
    if not os.path.exists(PDF_PATH):
        st.error(
            f"PDF file not found at '{PDF_PATH}'. "
            "Please place 'laborcode.pdf' in the application directory."
        )
        st.stop()
    raw_text = extract_text_from_pdf(PDF_PATH)
    return chunk_text_by_article(clean_text(raw_text))
631
+
632
+
633
@st.cache_resource(show_spinner="Loading embedding model and building indices...")
def load_retrieval_infra(_chunks: list[str]):
    """Build the shared retrieval stack (cached per session).

    Returns (embedder, faiss_index, bm25, reranker): a BGE-M3 embedder, a
    FAISS inner-product index over normalized "passage:"-prefixed chunk
    embeddings, a BM25 index over lowercase tokens, and the cross-encoder
    reranker. The leading underscore on *_chunks* tells Streamlit not to
    hash the argument.
    """
    embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
    chunk_embeddings = embedder.encode(
        [f"passage: {c}" for c in _chunks],
        batch_size=16,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    # Inner product over normalized vectors == cosine similarity.
    idx = faiss.IndexFlatIP(chunk_embeddings.shape[1])
    idx.add(chunk_embeddings)
    bm25 = BM25Okapi([c.lower().split() for c in _chunks])
    reranker = CrossEncoder(RERANKER_MODEL_NAME)
    return embedder, idx, bm25, reranker
651
+
652
+
653
# ---------------------------------------------------------------------------
# Main application
# ---------------------------------------------------------------------------
def main():
    """Render the Streamlit UI: sidebar configuration, retrieval, three-model
    generation, evaluation metrics, and a per-model summary."""
    # --- Sidebar ---
    st.sidebar.title("Configuration")
    st.sidebar.markdown("---")
    st.sidebar.subheader("About")
    st.sidebar.markdown(
        "**Philippine Labor Code RAG Assistant**\n\n"
        "This application uses Retrieval-Augmented Generation to answer "
        "questions about the Philippine Labor Code (PD 442). Three models "
        "are evaluated side-by-side:\n\n"
        "- Qwen2.5-7B-Instruct\n"
        "- LLaMA-3.1-8B-Instruct\n"
        "- Gemma-2-9B-IT\n\n"
        "All models use 4-bit quantization and share the same hybrid "
        "retrieval pipeline (BGE-M3 + BM25 + RRF + cross-encoder reranking)."
    )

    st.sidebar.markdown("---")
    st.sidebar.subheader("Pipeline Parameters")
    top_k = st.sidebar.slider("Final retrieved chunks (top-k)", 3, 10, 5)
    max_tokens = st.sidebar.slider("Max generation tokens", 128, 1024, 512, step=64)

    st.sidebar.markdown("---")
    st.sidebar.subheader("Ground Truth (optional)")
    ground_truth = st.sidebar.text_area(
        "Expected answer for evaluation",
        placeholder="Paste ground truth here to compute evaluation metrics...",
        height=120,
    )
    expected_articles_raw = st.sidebar.text_input(
        "Expected article numbers (comma-separated)",
        placeholder="e.g. 83, 86, 94",
    )
    expected_articles = [
        a.strip() for a in expected_articles_raw.split(",") if a.strip()
    ]

    # --- Main area ---
    st.title("Philippine Labor Code & Employee Rights Assistant")
    st.markdown(
        "Multi-Model RAG Evaluation: "
        "**Qwen2.5-7B** | **LLaMA-3.1-8B** | **Gemma-2-9B**"
    )
    st.markdown("---")

    # Cached loaders: PDF chunking and index building run once per session.
    chunks = load_chunks()
    embedder, faiss_index, bm25, reranker = load_retrieval_infra(chunks)

    st.success(
        f"Retrieval pipeline ready. {len(chunks)} document chunks indexed."
    )

    # --- Query input ---
    st.subheader("Ask a Question")
    question = st.text_input(
        "Enter your question about the Philippine Labor Code:",
        placeholder="e.g. What are the just causes for termination by employer?",
    )

    if not question:
        st.info(
            "Type a question above and press Enter to query all three models."
        )
        return

    # --- Greeting check: short-circuit small talk before the pipeline ---
    if is_greeting(question):
        st.markdown("### Response")
        st.info(GREETING_RESPONSE)
        return

    # --- Retrieval ---
    st.markdown("---")
    with st.spinner("Retrieving relevant context from the Labor Code..."):
        retrieval_start = time.time()
        top_chunks, top_scores = hybrid_retrieve_and_rerank(
            question=question,
            embedder=embedder,
            index=faiss_index,
            bm25=bm25,
            reranker=reranker,
            chunks=chunks,
            initial_k=20,
            rerank_k=8,
            final_k=top_k,
        )
        retrieval_time = time.time() - retrieval_start

    # --- Display retrieved chunks ---
    st.subheader("Retrieved Context Chunks")
    st.caption(f"Retrieval completed in {retrieval_time:.2f}s")
    for i, (chunk, score) in enumerate(zip(top_chunks, top_scores)):
        with st.expander(
            f"Chunk {i + 1} | Reranker score: {score:.4f}",
            expanded=(i == 0),
        ):
            st.text(chunk)

    # --- Generation across all three models ---
    st.markdown("---")
    st.subheader("Model Responses")

    all_answers = {}
    all_metrics = {}
    all_latencies = {}

    model_names = list(MODEL_CONFIGS.keys())
    cols = st.columns(len(model_names))

    # Models are loaded one at a time and unloaded after generating so all
    # three can share the same GPU.
    for col, model_name in zip(cols, model_names):
        with col:
            st.markdown(f"**{model_name}**")
            status_placeholder = st.empty()
            answer_placeholder = st.empty()
            latency_placeholder = st.empty()

            status_placeholder.warning("Loading model...")

            try:
                gen_start = time.time()
                model, tokenizer = load_model_and_tokenizer(model_name)

                status_placeholder.warning("Generating response...")
                prompt = build_prompt(model_name, question, top_chunks, tokenizer)
                answer = generate_answer(
                    model, tokenizer, prompt, max_new_tokens=max_tokens
                )
                gen_time = time.time() - gen_start

                # Unload to free GPU memory for the next model
                unload_model(model, tokenizer)

                all_answers[model_name] = answer
                all_latencies[model_name] = gen_time

                status_placeholder.empty()
                answer_placeholder.markdown(answer)
                latency_placeholder.caption(
                    f"Generation time: {gen_time:.1f}s"
                )

            except Exception as e:
                # A failed model gets an empty answer so the metric/summary
                # sections below still render for the others.
                status_placeholder.empty()
                answer_placeholder.error(
                    f"Failed to load or run {model_name}: {str(e)}"
                )
                all_answers[model_name] = ""
                all_latencies[model_name] = 0.0

    # --- Evaluation metrics ---
    st.markdown("---")
    st.subheader("Evaluation Metrics")

    if not ground_truth and not expected_articles:
        st.info(
            "To view full evaluation metrics, provide a ground truth answer "
            "and/or expected article numbers in the sidebar."
        )
        # Still compute the metrics that do not require ground truth
        for model_name, answer in all_answers.items():
            if answer:
                all_metrics[model_name] = {
                    "Faithfulness": round(
                        compute_faithfulness(answer, top_chunks, embedder), 4
                    ),
                    "Answer Relevancy": round(
                        compute_answer_relevancy(answer, question, embedder), 4
                    ),
                }
    else:
        gt = ground_truth if ground_truth else ""
        for model_name, answer in all_answers.items():
            if answer:
                metrics = evaluate_single_response(
                    question=question,
                    answer=answer,
                    context_chunks=top_chunks,
                    ground_truth=gt,
                    expected_articles=expected_articles,
                    embedder=embedder,
                )
                metrics["Latency (s)"] = round(all_latencies.get(model_name, 0), 2)
                all_metrics[model_name] = metrics

    if all_metrics:
        # Display as a comparison table
        st.markdown("#### Metric Comparison Table")

        # Union of metric keys across models, preserving first-seen order.
        all_metric_keys = []
        for metrics in all_metrics.values():
            for key in metrics:
                if key not in all_metric_keys:
                    all_metric_keys.append(key)

        table_header = "| Metric | " + " | ".join(all_metrics.keys()) + " |"
        table_sep = "|---|" + "|".join(["---"] * len(all_metrics)) + "|"
        table_rows = []
        for metric_key in all_metric_keys:
            row = f"| {metric_key} |"
            for model_name in all_metrics:
                val = all_metrics[model_name].get(metric_key, "N/A")
                row += f" {val:.4f} |" if isinstance(val, float) else f" {val} |"
            table_rows.append(row)

        st.markdown("\n".join([table_header, table_sep] + table_rows))

        # Render comparison chart (only for 0-1 range metrics)
        displayable = [
            "Faithfulness", "Semantic Similarity", "Answer Relevancy",
            "Citation Accuracy", "ROUGE-L", "Recall@5", "Precision@5",
        ]
        chart_metrics = {
            model_name: {k: v for k, v in metrics.items() if k in displayable}
            for model_name, metrics in all_metrics.items()
        }

        if any(chart_metrics.values()):
            st.markdown("#### Visual Comparison")
            fig = render_comparison_chart(chart_metrics)
            st.pyplot(fig)
            plt.close(fig)

    # --- Summary ---
    st.markdown("---")
    st.subheader("Summary")
    summary_cols = st.columns(len(model_names))
    for col, model_name in zip(summary_cols, model_names):
        with col:
            st.markdown(f"**{model_name}**")
            latency = all_latencies.get(model_name, 0)
            answer = all_answers.get(model_name, "")
            word_count = len(answer.split()) if answer else 0
            st.markdown(f"- Response length: {word_count} words")
            st.markdown(f"- Total latency: {latency:.1f}s")
            if model_name in all_metrics:
                faith = all_metrics[model_name].get("Faithfulness", "N/A")
                if isinstance(faith, float):
                    st.markdown(f"- Faithfulness: {faith:.4f}")
                rel = all_metrics[model_name].get("Answer Relevancy", "N/A")
                if isinstance(rel, float):
                    st.markdown(f"- Answer Relevancy: {rel:.4f}")


if __name__ == "__main__":
    main()
laborcode.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9abd6f8eb3fb329456e926aa46ce05fbfe75f562f8aa6a60e7f421de97295c6
3
+ size 1532508
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ transformers>=4.44.0
3
+ bitsandbytes
4
+ accelerate
5
+ sentencepiece
6
+ protobuf
7
+ sentence-transformers
8
+ faiss-cpu
9
+ rank-bm25
10
+ pypdf
11
+ rouge-score
12
+ nltk
13
+ scikit-learn
14
+ numpy
15
+ matplotlib
16
+ streamlit>=1.30.0
17
+ langdetect