Update src/simple_rag.py

src/simple_rag.py  CHANGED  (+49, -60)
@@ -1,30 +1,43 @@
- # Modified RAG Pipeline for General Document Q&A (Khmer & English)
-
  import os
  import logging
  import torch
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModel
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain.schema import Document
-
- from
- from
- from

  logging.basicConfig(level=logging.INFO)

  use_gpu = torch.cuda.is_available()
  model_id = "aisingapore/Llama-SEA-LION-v3.5-8B-R"

- logging.info(use_gpu)

  # # Load model and tokenizer
  tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-
-
-
-

  pipeline = pipeline(
      "text-generation",
@@ -32,29 +45,25 @@ pipeline = pipeline(
      tokenizer=tokenizer,
  )

-
-
-
- DATA_PATH = os.path.join(WRITABLE_DIR, "src", "data")
- CHROMA_PATH = os.path.join(WRITABLE_DIR, "src", "chroma")
-
  embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")

- #
-
-
-
- # If the user asks in English, respond in English.
- # Use clear, concise sentences, no more than 50 word. Do not mention the existence of context.

-
- # {context}

-
-

-
-

  def load_documents():
      loader = PyPDFDirectoryLoader(DATA_PATH)
@@ -62,7 +71,7 @@ def load_documents():

  def split_text(documents: list[Document]):
      splitter = RecursiveCharacterTextSplitter(
-         chunk_size=
      )
      chunks = splitter.split_documents(documents)
      logging.info(f"Split {len(documents)} documents into {len(chunks)} chunks.")
@@ -102,44 +111,24 @@ def ask_question(query_text: str, k: int = 3):
      })

      context_text = "\n\n".join(chunk["text"] for chunk in context_chunks)
-
-     #logging.info(f"Prompt: {prompt}")
-
-     # Construct structured messages instead of using PROMPT_TEMPLATE
-     messages = [
-         {
-             "role": "user",
-             "content": f"""Base your answer only on the following context:\n\n{context_text}\n\nQuestion: {query_text}\nAnswer:"""
-         }
-     ]

      prompt = tokenizer.apply_chat_template(
-
-
-
-
-
-
-     logging.info(f"Prompts: {prompt}")

      output = pipeline(
          prompt,
          max_new_tokens=128,
-         do_sample=False,
          return_full_text=False,
          truncation=True,
      )

-     # output = pipeline(
-     #     messages,
-     #     max_new_tokens=256,
-     #     return_full_text=False,
-     #     truncation=True,
-     #     do_sample=False,
-     # )
-
-
-     logging.info(f"Output: {output}")
-
      answer = output[0]["generated_text"].strip()
      return answer, context_chunks
src/simple_rag.py after the change (added lines marked with +):

  import os
  import logging
  import torch
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModel
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain.schema import Document
+ from langchain.vectorstores.chroma import Chroma
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.document_loaders import PyPDFDirectoryLoader
+ from transformers import BitsAndBytesConfig

  logging.basicConfig(level=logging.INFO)

  use_gpu = torch.cuda.is_available()
+
+ if use_gpu:
+     print("CUDA device in use:", torch.cuda.get_device_name(0))
+ else:
+     print("Running on CPU. No GPU detected.")
+
  model_id = "aisingapore/Llama-SEA-LION-v3.5-8B-R"


  # # Load model and tokenizer
  tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+ if use_gpu:
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         device_map="auto",
+         load_in_8bit=True,
+         torch_dtype=torch.float16,
+     )
+ else:
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         load_in_8bit=True,
+         device_map={"": "cpu"},  # Force CPU
+     )
+

  pipeline = pipeline(
      "text-generation",
      tokenizer=tokenizer,
  )

+ DATA_PATH = "./data/"
+ CHROMA_PATH = "chroma"
  embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")

+ # Generic assistant prompt for dual Khmer/English
+ PROMPT_TEMPLATE = """
+ You are a helpful assistant.
+ Answer the question based ONLY on the context below.

+ Use clear, concise sentences, no more than 50 words. Do not mention the existence of context.

+ Context:
+ {context}

+ Question:
+ {question}
+
+ Answer:
+ """.strip()

  def load_documents():
      loader = PyPDFDirectoryLoader(DATA_PATH)

  def split_text(documents: list[Document]):
      splitter = RecursiveCharacterTextSplitter(
+         chunk_size=512, chunk_overlap=100, length_function=len, add_start_index=True
      )
      chunks = splitter.split_documents(documents)
      logging.info(f"Split {len(documents)} documents into {len(chunks)} chunks.")

      # ... unchanged lines omitted; the hunk below is inside def ask_question(query_text: str, k: int = 3) ...

      })

      context_text = "\n\n".join(chunk["text"] for chunk in context_chunks)
+     prompt = PROMPT_TEMPLATE.format(context=context_text, question=query_text)

+     messages = [{"role": "user", "content": prompt}]
+     logging.info("Sending prompt to model...")
      prompt = tokenizer.apply_chat_template(
+         messages,
+         add_generation_prompt=True,
+         tokenize=False,
+         thinking_mode="off"
+     )

      output = pipeline(
          prompt,
          max_new_tokens=128,
          return_full_text=False,
          truncation=True,
+         do_sample=False,
      )

      answer = output[0]["generated_text"].strip()
      return answer, context_chunks
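Note that BitsAndBytesConfig is imported in the updated file but never used; 8-bit loading is requested through load_in_8bit=True instead. For reference, a minimal sketch of the equivalent quantization_config form is shown below. It is an illustration, not part of the commit, and it assumes the bitsandbytes package is installed and a CUDA GPU is available (8-bit loading typically requires one).

# Hypothetical alternative to load_in_8bit=True, using the imported BitsAndBytesConfig.
# Assumes bitsandbytes is installed and a CUDA device is present.
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "aisingapore/Llama-SEA-LION-v3.5-8B-R",
    device_map="auto",
    quantization_config=quant_config,
)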
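A small driver script can exercise the pipeline end to end. The sketch below is an assumption rather than repository code: only ask_question(query_text, k=3) and its (answer, context_chunks) return value come from the diff; the import path, the sample question, and the __main__ guard are hypothetical.

# Hypothetical smoke test for the updated RAG pipeline.
# Assumes it is run from the repo root and the Chroma index has been built from ./data/.
from src.simple_rag import ask_question

if __name__ == "__main__":
    question = "What is this document about?"
    answer, context_chunks = ask_question(question, k=3)
    print("Answer:", answer)
    for i, chunk in enumerate(context_chunks, start=1):
        print(f"--- context chunk {i} ---")
        print(chunk["text"])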