Hasnain Ali committed on
Commit
70d4c53
·
1 Parent(s): fbe206f

update application file

Browse files
Files changed (1) hide show
  1. app.py +96 -4
app.py CHANGED
@@ -1,7 +1,99 @@
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import gradio as gr
import transformers
from torch import cuda, bfloat16

from langchain.chains import RetrievalQA
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.schema import AIMessage, HumanMessage
from langchain.vectorstores import Chroma
10
 
 
 
11
 
12
# --- Embedding model, dataset, and vector store setup -------------------------

# Sentence-transformer used to embed the policy chunks for retrieval.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 4},
)

dataset_name = "beinghasnain16/company-policies"
page_content_column = "chunk"  # dataset column that holds the document text

# SECURITY: the original commit hard-coded a HuggingFace access token here.
# A token committed to source control is compromised and must be revoked.
# Read it from the environment (e.g. a Space secret named HF_TOKEN) instead.
hf_auth = os.environ.get('HF_TOKEN', '')

loader = HuggingFaceDatasetLoader(dataset_name, page_content_column, use_auth_token=hf_auth)
data = loader.load()

# In-memory Chroma index over the policy chunks, queried by the RAG chain below.
vectordb = Chroma.from_documents(data, embed_model)
31
+
32
# --- LLM (Llama-2 13B chat) loading and RAG chain -----------------------------

model_id = 'meta-llama/Llama-2-13b-chat-hf'
# Alternatives tried during development:
# model_id = 'microsoft/phi-1_5'
# model_id = 'meta-llama/Llama-2-7b-chat-hf'

# 4-bit NF4 quantization so the 13B model fits in limited GPU memory.
# This requires the `bitsandbytes` library.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16,
)

# Gated repo: config, weights, and tokenizer downloads all need the auth token.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth,
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',  # let accelerate place layers across available devices
    use_auth_token=hf_auth,
)
model.eval()  # inference only — disable dropout etc.
print(f"Model loaded on {device}")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth,
)

generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # Generation parameters. The original passed temperature=0.0, which is
    # ignored (and rejected by newer transformers) unless do_sample=True;
    # explicit greedy decoding gives the same deterministic behavior.
    do_sample=False,
    max_new_tokens=512,      # max number of tokens to generate in the output
    repetition_penalty=1.1,  # without this output begins repeating
)

llm = HuggingFacePipeline(pipeline=generate_text)

# NOTE(review): the original ran a full 512-token sanity generation here at
# import time ("Explain ... nuclear fission and fusion") and printed it; that
# debug call doubled app startup cost and is removed.

# Retrieval-augmented QA: 'stuff' packs the retrieved chunks straight into the prompt.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm, chain_type='stuff',
    retriever=vectordb.as_retriever(),
)
89
+
90
def predict(message, history):
    """Gradio ChatInterface callback: answer `message` via the RAG chain.

    `history` (list of (human, ai) pairs) is accepted to satisfy the
    ChatInterface signature but is not used — RetrievalQA is stateless, so
    each question is answered independently from the retrieved context.

    Returns the chain's answer string.
    """
    # NOTE: the original built a HumanMessage/AIMessage list from `history`
    # but never passed it anywhere; that dead code is removed here.
    llm_response = rag_pipeline(message)
    return llm_response['result']


gr.ChatInterface(predict).launch()