Mike0307 committed on
Commit
c51ff3f
·
verified ·
1 Parent(s): 4bb5584

Update README.md

Browse files

Add Langchain RAG example

Files changed (1) hide show
  1. README.md +91 -6
README.md CHANGED
@@ -5,11 +5,11 @@ metrics:
5
  - spearmanr
6
  ---
7
 
8
- ### Overview
9
  This model is primarily designed for language understanding between Chinese texts.<br>
10
  It utilizes the **CoSENT** training framework for the purpose of the Retrieval-Augmented Generation (RAG) task.
11
 
12
- ### Download the model
13
 
14
  ```python
15
  from transformers import AutoTokenizer, AutoModel
@@ -17,7 +17,7 @@ tokenizer = AutoTokenizer.from_pretrained("Mike0307/text2vec-base-chinese-rag")
17
  model = AutoModel.from_pretrained("Mike0307/text2vec-base-chinese-rag")
18
  ```
19
 
20
- ### Example of similarity comparison
21
  ```python
22
  import torch
23
  def mean_pooling(model_output, attention_mask):
@@ -43,9 +43,9 @@ torch.cosine_similarity(embeddings[0], embeddings[1], dim=0)
43
 
44
  ```
45
 
46
- ### Example of Langchain Retriever
47
 
48
- RAG with langchain: https://python.langchain.com/v0.1/docs/use_cases/question_answering/
49
  ![image/png](https://cdn-uploads.huggingface.co/production/uploads/6414866f1cbd604c9217c7d0/RrBoHJINfrSWtCNkePs7g.png)
50
 
51
  Install the langchain packages
@@ -54,6 +54,8 @@ Install the langchain packages
54
  pip install --upgrade --quiet langchain langchain-community
55
  ```
56
 
 
 
57
  Download HuggingFace model through langchain_community
58
 
59
  ```python
@@ -77,6 +79,89 @@ documents = [
77
  Document(page_content="滾石國際音樂股份有限公司 Rock Records Co., Ltd. 曾用名 滾石雜誌社 滾石有聲出版社 公司類型 股份有限公司 統一編號 22012304 成立 1976年,滾石雜誌社 1980年,滾石有聲出版社 1986年1月28日(公司登記日期)(38年113天) 創辦人 段鍾沂、段鍾潭 代表人物 段鍾沂、段鍾潭 "),
78
  ]
79
  db = FAISS.from_documents(documents, embeddings)
80
- db.similarity_search("福井舞所屬哪家唱片公司?", k=1)
 
81
  # [Document(page_content='23歲時出道、血型A型的福井舞是出身於京都的日本女創作歌手,所屬唱片公司為J-more。2004年,與WADAGAKI、SHINO組合地下音樂隊Poplar,發表了兩張專輯,天照和夢死物語。在2006年時退出,2007年10月加入了Avex獨立發展。')]
82
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  - spearmanr
6
  ---
7
 
8
+ ## Overview
9
  This model is primarily designed for language understanding between Chinese texts.<br>
10
  It utilizes the **CoSENT** training framework for the purpose of the Retrieval-Augmented Generation (RAG) task.
11
 
12
+ ## Download the model
13
 
14
  ```python
15
  from transformers import AutoTokenizer, AutoModel
 
17
  model = AutoModel.from_pretrained("Mike0307/text2vec-base-chinese-rag")
18
  ```
19
 
20
+ ## Example of similarity comparison
21
  ```python
22
  import torch
23
  def mean_pooling(model_output, attention_mask):
 
43
 
44
  ```
45
 
46
+ ## Example of Langchain RAG
47
 
48
+ RAG with Langchain: https://python.langchain.com/v0.1/docs/use_cases/question_answering/
49
  ![image/png](https://cdn-uploads.huggingface.co/production/uploads/6414866f1cbd604c9217c7d0/RrBoHJINfrSWtCNkePs7g.png)
50
 
51
  Install the langchain packages
 
54
  pip install --upgrade --quiet langchain langchain-community
55
  ```
56
 
57
+ ### Use this embedding model to build a retriever
58
+
59
  Download HuggingFace model through langchain_community
60
 
61
  ```python
 
79
  Document(page_content="滾石國際音樂股份有限公司 Rock Records Co., Ltd. 曾用名 滾石雜誌社 滾石有聲出版社 公司類型 股份有限公司 統一編號 22012304 成立 1976年,滾石雜誌社 1980年,滾石有聲出版社 1986年1月28日(公司登記日期)(38年113天) 創辦人 段鍾沂、段鍾潭 代表人物 段鍾沂、段鍾潭 "),
80
  ]
81
  db = FAISS.from_documents(documents, embeddings)
82
+ retriever = db.as_retriever(search_kwargs = {"k" : 1})
83
+ retriever.invoke("福井舞所屬哪家唱片公司?")
84
  # [Document(page_content='23歲時出道、血型A型的福井舞是出身於京都的日本女創作歌手,所屬唱片公司為J-more。2004年,與WADAGAKI、SHINO組合地下音樂隊Poplar,發表了兩張專輯,天照和夢死物語。在2006年時退出,2007年10月加入了Avex獨立發展。')]
85
  ```
86
+
87
+ ### Use HuggingFace LLM as the langchain LLM
88
+
89
+ First, download the HuggingFace LLM using the code below. Check this [repo](https://huggingface.co/Mike0307/Phi-3-mini-4k-instruct-chinese-lora) if you encounter any problems.
90
+
91
+ ```python
92
+ import torch
93
+ from transformers import AutoModelForCausalLM, AutoTokenizer
94
+
95
+ llm_id = "Mike0307/Phi-3-mini-4k-instruct-chinese-lora"
96
+ model = AutoModelForCausalLM.from_pretrained(
97
+ llm_id,
98
+ device_map="mps", # Change mps if not MacOS
99
+ torch_dtype=torch.float32, # try float16 for M1 chip
100
+ trust_remote_code=True,
101
+ attn_implementation="eager", # without flash_attn
102
+ )
103
+ tokenizer = AutoTokenizer.from_pretrained(llm_id)
104
+ ```
105
+
106
+ Second, construct a valid langchain LLM class using the customized HuggingFace model.
107
+
108
+ ```python
109
+ import re
110
+ from pydantic import Field
111
+ from typing import Any, List, Optional
112
+ from langchain.prompts import PromptTemplate
113
+ from langchain.schema.runnable import RunnablePassthrough
114
+ from langchain_core.callbacks.manager import CallbackManagerForLLMRun
115
+ from langchain_core.language_models.llms import LLM
116
+
117
+ class CustomLLM(LLM):
118
+ model : Any = Field(..., description="The huggingface llm model")
119
+ tokenizer : Any = Field(..., description="The huggingface llm tokenizer.")
120
+ def __init__(self, model, tokenizer):
121
+ super().__init__(model = model, tokenizer = tokenizer)
122
+
123
+ def _call(self, prompt: str, stop: Optional[List[str]] = None,
124
+ run_manager: Optional[CallbackManagerForLLMRun] = None,**kwargs: Any,) -> str:
125
+ if stop is not None:
126
+ raise ValueError("stop kwargs are not permitted.")
127
+ inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
128
+ outputs = self.model.generate(**inputs, temperature = 0.0, max_length = 500, do_sample = False)
129
+ generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
130
+ return self.output_parser(generated_text)
131
+
132
+ @property
133
+ def _llm_type(self) -> str:
134
+ return "custom"
135
+
136
+ def output_parser(output):
137
+ pattern = "<\|assistant\|>(.*?)<\|endoftext\|>"
138
+ match = re.search(pattern, output, re.DOTALL)
139
+ if match:
140
+ return match.group(1).strip()
141
+ return output.strip()
142
+
143
+ ```
144
+
145
+ ### Make a simple RAG chain
146
+
147
+ Use `prompt`, `llm`, `retriever` to build a simple RAG chain and try inference.
148
+
149
+ ```python
150
+ import langchain
151
+ langchain.debug = True # Check the chain process and validate the retrieved documents
152
+
153
+ prompt = PromptTemplate.from_template(template="<|user|>{documents}\n{question} <|end|>\n<|assistant|>")
154
+ llm = CustomLLM(model, tokenizer)
155
+ rag = {
156
+ "question" : RunnablePassthrough(),
157
+ "documents" : retriever
158
+ } | prompt | llm
159
+
160
+ ## example of inference
161
+ query = "埃及聖䴉是什麼?"
162
+ rag.invoke(query)
163
+ ## '埃及聖䴉是一種埃及的朱鷺,它在埃及備受尊敬,經常被製成木乃伊當做托特的象徵。它也被引入到法國、義大利、西班牙及美國。現在,在臺灣西部濱海地區也可看到埃及聖䴉。'
164
+
165
+ ```
166
+
167
+