Spaces:

Santipab
/

EEC-Hackathon-Space

Sleeping

App Files Files Community

Santipab committed on Nov 25, 2024

Commit

bde3dc5

verified ·

1 Parent(s): dc4adb2

Upload app.py

Browse files

Files changed (1) hide show

app.py +113 -0

app.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import os
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import WebBaseLoader
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.vectorstores import Chroma
+from langchain_ollama import embeddings
+from langchain_ollama import ChatOllama
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain.output_parsers import PydanticOutputParser
+from langchain.text_splitter import CharacterTextSplitter
+from sentence_transformers import SentenceTransformer
+from aift.multimodal import textqa
+from aift import setting
+from langchain_community.document_loaders import TextLoader
+from langchain_text_splitters import CharacterTextSplitter
+import streamlit as st
+class CustomEmbeddings:
+    def __init__(self, model_name="mrp/simcse-model-m-bert-thai-cased"):
+        """
+        Initialize the embedding model using SentenceTransformer.
+        :param model_name: Name of the pre-trained model
+        """
+        self.model = SentenceTransformer(model_name)
+    def embed_query(self, text):
+        """
+        Generate embeddings for a single query.
+        :param text: Input text to embed
+        :return: Embedding vector as a Python list
+        """
+        embedding = self.model.encode([text])
+        return embedding[0].tolist()  # Convert NumPy array to list
+    async def aembed_query(self, text):
+        """
+        Asynchronous version of `embed_query`.
+        :param text: Input text to embed
+        :return: Embedding vector as a Python list
+        """
+        return self.embed_query(text)
+    def embed_documents(self, texts):
+        """
+        Generate embeddings for multiple documents.
+        :param texts: List of input texts to embed
+        :return: List of embedding vectors as Python lists
+        """
+        embeddings = self.model.encode(texts)
+        return [embedding.tolist() for embedding in embeddings]
+    async def aembed_documents(self, texts):
+        """
+        Asynchronous version of `embed_documents`.
+        :param texts: List of input texts to embed
+        :return: List of embedding vectors as Python lists
+        """
+        return self.embed_documents(texts)
+# Set Pathumma API Key
+setting.set_api_key('T69FqnYgOdreO5G0nZaM8gHcjo1sifyU')
+# Define a simple wrapper for Pathumma
+class PathummaModel:
+    def __init__(self):
+        pass
+    def generate(self, instruction: str, return_json: bool = False):
+        response = textqa.generate(instruction=instruction, return_json=return_json)
+        if return_json:
+            return response.get("content", "")
+        return response
+    def __call__(self, input: str):
+        return self.generate(input, return_json=False)
+# Initialize Pathumma Model
+model_local = PathummaModel()
+# Load the document, split it into chunks, embed each chunk and load it into the vector store.
+raw_documents = TextLoader('./mainn.txt').load()
+text_splitter = CharacterTextSplitter(chunk_size=7500, chunk_overlap=0)
+documents = text_splitter.split_documents(raw_documents)
+# 2. Convert documents to Embeddings and store them
+vectorstore = Chroma.from_documents(
+    documents=documents,
+    collection_name="rag-chroma",
+    embedding=CustomEmbeddings(model_name="mrp/simcse-model-m-bert-thai-cased"),
+)
+retriever = vectorstore.as_retriever()
+after_rag_template = """ตอบคำถามโดยพิจารณาจากบริบทต่อไปนี้เท่านั้น:
+{context}
+คำถาม: {question}
+"""
+after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
+# Query retriever for context and pass to Pathumma
+def system_call(text_input):
+    question = text_input
+    retrieved_context = retriever.invoke(question)
+    context = "\n".join([doc.page_content for doc in retrieved_context])
+    after_rag_chain = after_rag_prompt.invoke({
+        "context": context,
+        "question": question,
+    })
+    response = model_local(after_rag_chain)
+    st.write("response")
+    st.write(response)
+system_call("ผมชื่ออะไรเหรอ")