microhum committed on
Commit d8023d7 · 1 Parent(s): 7eff878

initial commit

Files changed (14)
  1. .gitattributes +1 -0
  2. Dockerfile +18 -0
  3. app.py +55 -60
  4. files/กระบวนการของบริษัทการบินไทย จำกัด (มหาชน) ในการฟื้นฟูกิจการ ภายใต้บริบทการเมืองไทย ในช่วงปี พ.ศ. 2562 - 2566.pdf +3 -0
  5. files/กลยุทธ์การหาเสียงของพรรคการเมืองกับแผนยุทธศาสตร์ชาติ 20 ปี.pdf +3 -0
  6. files/การนำองค์กรของผู้บริหารต่อประสิทธิผลตามพันธกิจของมหาวิทยาลัยเทคโนโลยีราชมงคลรัตนโกสินทร์.pdf +3 -0
  7. files/ความสัมพันธ์ระหว่างการวางแผนภาษีกับราคาหลักทรัพย์ของบริษัทที่จดทะเบียนในตลาดหลักทรัพย์แห่งประเทศไทย กลุ่มเกษตรและอุตสาหกรรมอาหาร.pdf +3 -0
  8. files/ทิศทางความมั่นคงทางอวกาศของไทย.pdf +3 -0
  9. files/บทบาทคณะกรรมการกลางอิสลามแห่งประเทศไทยและอิทธิพลต่อการกำหนดนโยบายทางการเมืองของพรรคการเมือง.pdf +3 -0
  10. files/ผลกระทบจากการเปลี่ยนแปลงทางดิจิทัลในองค์กรต่อกระบวนการสอบบัญชี.pdf +3 -0
  11. main.py +48 -0
  12. rag.py +97 -0
  13. requirements.txt +12 -1
  14. storePDF.py +28 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,18 @@
+FROM ubuntu:22.04
+RUN apt-get -y update && apt-get -y install software-properties-common && apt-get -y install curl && apt-get -y install build-essential \
+    && add-apt-repository -y ppa:deadsnakes/ppa && apt-get -y install python3.10 && apt-get -y install python3-pip
+
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+# Set home to the user's home directory
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+
+WORKDIR $HOME/app
+
+COPY --chown=user . $HOME/app
+
+CMD ["sh", "-c", "python main.py & python app.py"]
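Note: the CMD backgrounds the FastAPI backend (main.py) and runs the Gradio frontend (app.py) in the same shell, so neither process is supervised. A minimal Python sketch of the same two-process startup (a hypothetical supervisor.py, not part of this commit):

```python
import subprocess
import sys

def main():
    # Mirror the Dockerfile CMD: backend first (uvicorn on :8000), then frontend.
    backend = subprocess.Popen([sys.executable, "main.py"])
    frontend = subprocess.Popen([sys.executable, "app.py"])
    try:
        # Block on the frontend; if it exits, tear the backend down too.
        frontend.wait()
    finally:
        backend.terminate()

if __name__ == "__main__":
    main()
```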
app.py CHANGED
@@ -1,63 +1,58 @@
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_core.messages import AIMessage, HumanMessage
+from pydantic import BaseModel
+import rag
+import time
 import gradio as gr
-from huggingface_hub import InferenceClient
-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-
+import requests
+from main import run_server
+
+class ChatInput(BaseModel):
+    question: str
+
+chat_history = []
+
+
+def generate_response(chat_input: str, bot_message: str) -> str:
+    url = "http://127.0.0.1:8000/generatechat/"
+    payload = {
+        'question': chat_input,
+    }
+    headers = {
+        'Content-Type': 'application/json'
+    }
+
+    response = requests.post(url, json=payload, headers=headers)
+    if response.status_code == 200:
+        data = response.json()
+        answer = data['response']['answer']
+        print("Success:", response.json())
+
+        # Get a typewriting animation response
+        partial_response = ""
+        for char in answer:
+            partial_response += char
+            yield partial_response
+            time.sleep(0.005)
+    else:
+        print("Error:", response.status_code, response.text)
+        return f"Error: {response.status_code}, {response.text}"
+
+with gr.Blocks() as demo:
+    with gr.Column():
+
+        chatbot = gr.ChatInterface(
+            fn=generate_response,
+            title="ThaiCodex Chat",
+            description="Ask questions based on the content of the uploaded or specified PDF.",
+        )
+
+        # with gr.Row():
+        #     pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+        #     upload_button = gr.Button("Load PDF")
+        output_text = gr.Textbox(label="Status")
+        # upload_button.click(, inputs=[pdf_input], outputs=output_text)
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
+    run_server()  # uvicorn api
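The Gradio callback above is a thin client over the FastAPI route added in main.py. For reference, the same request outside Gradio (a sketch, assuming the backend is running on 127.0.0.1:8000):

```python
import requests

# POST to the /generatechat/ endpoint; main.py returns {"response": {..., "answer": ...}}.
resp = requests.post(
    "http://127.0.0.1:8000/generatechat/",
    json={"question": "What topics do the stored PDFs cover?"},
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["response"]["answer"])
```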
files/กระบวนการของบริษัทการบินไทย จำกัด (มหาชน) ในการฟื้นฟูกิจการ ภายใต้บริบทการเมืองไทย ในช่วงปี พ.ศ. 2562 - 2566.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b3e6b772982884be5014c7fb98c0ea21fe7410b5d90d4eb79b05ec926739359
+size 352291
files/กลยุทธ์การหาเสียงของพรรคการเมืองกับแผนยุทธศาสตร์ชาติ 20 ปี.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:190ccc8f2ab2e762e1a56031ce14c182acefbfc0acbf20bd479391328114237d
+size 859822
files/การนำองค์กรของผู้บริหารต่อประสิทธิผลตามพันธกิจของมหาวิทยาลัยเทคโนโลยีราชมงคลรัตนโกสินทร์.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bbd1699b39aa368d98e6d92f00dd87bd20c0c32a1b244b9ca25d2dd68b6f913
+size 411260
files/ความสัมพันธ์ระหว่างการวางแผนภาษีกับราคาหลักทรัพย์ของบริษัทที่จดทะเบียนในตลาดหลักทรัพย์แห่งประเทศไทย กลุ่มเกษตรและอุตสาหกรรมอาหาร.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78174384ea62e77167261b25dcbc7b1d40091af9caf5309e878fb0062e5e566d
+size 1152292
files/ทิศทางความมั่นคงทางอวกาศของไทย.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97efed2c7c096136832657debf39c108063228b46db86d1008e1d53e7b7f994c
+size 215271
files/บทบาทคณะกรรมการกลางอิสลามแห่งประเทศไทยและอิทธิพลต่อการกำหนดนโยบายทางการเมืองของพรรคการเมือง.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:607ac8847ce3dcc4ca2697c874abae1bf8fde198489b7407eec9b3249375474e
+size 362263
files/ผลกระทบจากการเปลี่ยนแปลงทางดิจิทัลในองค์กรต่อกระบวนการสอบบัญชี.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc504a7468769c1b02ef3566035a1652e06a580ebf7abf3146bbd7545ccef1d0
+size 896227
main.py ADDED
@@ -0,0 +1,48 @@
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_core.messages import AIMessage, HumanMessage
+from fastapi import FastAPI
+from pydantic import BaseModel
+import os
+from rag import Rag
+from storePDF import get_documents_from_path
+
+folder_path = "files"
+all_documents = get_documents_from_path(folder_path)
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
+texts = text_splitter.split_documents(all_documents)
+
+rag_llm = Rag()
+
+rag_llm.createRagChain()
+
+## Chat API
+chat_history = []
+
+class ChatInput(BaseModel):
+    question: str
+
+app = FastAPI()
+
+@app.get("/")
+async def root():
+    return {"message": "Hello World"}
+
+@app.post("/generatechat/")
+async def generateResponse(chat_input: ChatInput):
+    ai_msg = rag_llm.generateResponse(chat_input.question, chat_history)
+    chat_history.extend(
+        [
+            HumanMessage(content=chat_input.question),
+            AIMessage(content=ai_msg["answer"]),
+        ]
+    )
+    return {"response": ai_msg}
+
+def run_server():
+    import uvicorn
+    uvicorn.run(app, host="127.0.0.1", port=8000)
+    print("Server is running")
+
+if __name__ == "__main__":
+    run_server()
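The splitter parameters (chunk_size=1000, chunk_overlap=200) govern retrieval granularity, though note that `texts` is computed here and never used; ingestion actually happens in storePDF.py. A standalone sketch of what those parameters do:

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Same settings as main.py: chunks of at most ~1000 characters, with a
# 200-character overlap so text near a chunk boundary lands in both chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

sample = ("lorem ipsum " * 200).strip()   # stand-in for extracted PDF text
chunks = splitter.split_text(sample)
print(len(chunks), [len(c) for c in chunks])
```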
rag.py ADDED
@@ -0,0 +1,97 @@
+from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.prompts import MessagesPlaceholder
+from langchain.chains import create_history_aware_retriever
+from langchain_pinecone import PineconeVectorStore
+from pinecone import Pinecone
+from uuid import uuid4
+import os
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_openai import ChatOpenAI
+
+class Rag:
+    def __init__(self):
+        self.embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+        self.model = ChatOpenAI(
+            base_url='https://api.opentyphoon.ai/v1',
+            model='typhoon-v1.5-instruct',
+            api_key="sk-clKR9DG6C5K02OeHUBU927gbzXmTCydV9PjFaTBXfRVAJLKC",
+        )
+        self.system_prompt = (
+            """
+            You are a helpful librarian named ThaiCodex. A user has requested book recommendations.
+            We have retrieved {num_docs} document(s) based on the user's request, listed below:
+
+            {context}
+
+            Please list ALL and ONLY the books that were found above in the order they were retrieved.
+            For each book, provide:
+            1. The title.
+            2. A brief summary of its content.
+            3. A reference to locate the book (e.g., a link, university, organization, or other relevant details).
+
+            Format your response as a numbered list, matching the order in which the documents were retrieved.
+
+            Results:
+            """
+        )
+        self.contextualize_q_system_prompt = (
+            "Given a chat history and the latest user question "
+            "which might reference context in the chat history, "
+            "formulate a standalone question which can be understood "
+            "without the chat history. Do NOT answer the question, "
+            "just reformulate it if needed and otherwise return it as is."
+        )
+
+        self.contextualize_q_prompt = ChatPromptTemplate.from_messages(
+            [
+                ("system", self.contextualize_q_system_prompt),
+                MessagesPlaceholder("chat_history"),
+                ("human", "{input}"),
+            ]
+        )
+        self.qa_prompt = ChatPromptTemplate.from_messages(
+            [
+                ("system", self.system_prompt),
+                MessagesPlaceholder("chat_history"),
+                ("human", "{input}"),
+            ]
+        )
+
+        if not os.getenv("PINECONE_API_KEY"):
+            os.environ["PINECONE_API_KEY"] = "ed681339-2270-4f85-b416-a372e857827b"
+        pinecone_api_key = os.environ.get("PINECONE_API_KEY")
+        pc = Pinecone(api_key=pinecone_api_key)
+
+        index_name = "thaicodex"
+        index = pc.Index(index_name)
+        self.vectorstore = PineconeVectorStore(index=index, embedding=self.embedding)
+
+    def storeDocumentsInVectorstore(self, documents):
+        uuids = [str(uuid4()) for _ in range(len(documents))]
+        self.vectorstore.add_documents(documents=documents, ids=uuids)
+
+    def createRagChain(self):
+        self.question_answer_chain = create_stuff_documents_chain(self.model, self.qa_prompt)
+        self.history_aware_retriever = create_history_aware_retriever(self.model, self.vectorstore.as_retriever(), self.contextualize_q_prompt)
+        self.rag_chain = create_retrieval_chain(self.history_aware_retriever, self.question_answer_chain)
+
+    def generateResponse(self, question, chat_history):
+        retrieved_docs = self.vectorstore.as_retriever().get_relevant_documents(question)
+        num_docs = len(retrieved_docs)
+
+        docs = "\n\n".join([
+            f"{i+1}. Title: {doc.metadata.get('source')}\nContent: {doc.page_content}"
+            for i, doc in enumerate(retrieved_docs)
+        ])
+        print(num_docs)
+        print(docs)
+        ai_msg = self.rag_chain.invoke({
+            "context": docs,
+            "num_docs": num_docs,
+            "input": question,
+            "chat_history": chat_history
+        })
+        return ai_msg
+
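Taken together, Rag wires a history-aware retriever in front of a stuff-documents chain. A minimal end-to-end sketch of how main.py drives it (assuming valid API keys and a populated "thaicodex" Pinecone index):

```python
from langchain_core.messages import AIMessage, HumanMessage
from rag import Rag

rag_llm = Rag()
rag_llm.createRagChain()

chat_history = []
question = "Recommend documents about Thai politics"
result = rag_llm.generateResponse(question, chat_history)

# Keep the history in the message format the chain's MessagesPlaceholder expects.
chat_history.extend([
    HumanMessage(content=question),
    AIMessage(content=result["answer"]),
])
print(result["answer"])
```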
requirements.txt CHANGED
@@ -1 +1,12 @@
1
- huggingface_hub==0.22.2
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain==0.2.14
2
+ langchain_community==0.2.12
3
+ langchain_ollama==0.1.1
4
+ langchain_groq
5
+ langchain-pinecone
6
+ langchain_huggingface
7
+ langchain_openai
8
+ pypdf
9
+ chromadb
10
+ ollama
11
+ fastapi
12
+ gradio
storePDF.py ADDED
@@ -0,0 +1,28 @@
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.docstore.document import Document
+import os
+from rag import Rag
+
+pdf_folder_path = 'files'
+
+def get_documents_from_path(pdf_folder_path: str = pdf_folder_path) -> list:
+    documents = []
+    for pdf_file in os.listdir(pdf_folder_path):
+        if pdf_file.endswith('.pdf'):
+            loader = PyPDFLoader(os.path.join(pdf_folder_path, pdf_file))
+            pdf_documents = loader.load()
+            file_name_without_extension = os.path.splitext(pdf_file)[0]
+            for doc in pdf_documents:
+                documents.append(Document(page_content=doc.page_content, metadata={"source": file_name_without_extension}))
+
+    return documents
+
+if __name__ == "__main__":
+    try:
+        rag_llm = Rag()
+        documents = get_documents_from_path()
+        rag_llm.storeDocumentsInVectorstore(documents)
+        print("Store PDFs Completed")
+
+    except Exception as e:
+        print(e)
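Each PDF page becomes its own Document, and every page of a file shares the filename (minus ".pdf") as metadata["source"], which rag.py later formats as the Title of each retrieved result. A quick sanity check of that shape (a sketch, run from the repo root):

```python
from storePDF import get_documents_from_path

docs = get_documents_from_path("files")
# One Document per PDF page; 'source' carries the filename without extension.
for doc in docs[:3]:
    print(doc.metadata["source"], "->", len(doc.page_content), "chars")
```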