Ultronprime committed on
Commit
25e3a1a
·
verified ·
1 Parent(s): 7334e90

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -47
app.py CHANGED
@@ -13,6 +13,7 @@ from sklearn.decomposition import PCA
13
  import numpy as np
14
  import plotly.graph_objects as go
15
  from sklearn.manifold import TSNE
 
16
 
17
  # --- User Configuration ---
18
  HF_USERNAME = os.getenv("HF_USERNAME")
@@ -29,56 +30,71 @@ if not API_TOKEN:
29
  def get_text_from_files(file_paths):
30
  all_text = []
31
  for filepath in file_paths:
32
- with open(filepath.name, "r", encoding="utf-8") as file:
33
- all_text.append(file.read())
 
 
 
34
  return all_text
35
 
36
  def get_embeddings(texts, model_id="sentence-transformers/all-mpnet-base-v2"):
37
- model = pipeline('feature-extraction', model=model_id, device="cuda")
38
- embeddings = model(texts)
 
 
 
 
39
  return embeddings
40
 
41
  def get_llm_response(query, context, model_id="HuggingFaceH4/zephyr-7b-beta"):
42
- tokenizer = AutoTokenizer.from_pretrained(model_id)
43
- model = pipeline("text-generation", model=model_id, device="cuda")
44
- prompt = f"""
45
- Answer the following question according to the provided context.
46
-
47
- Question: {query}
48
- Context: {context}
49
- Answer:
50
- """
51
- inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
52
-
53
- output = model(
54
- **inputs,
55
- max_new_tokens=250,
56
- do_sample=True,
57
- top_p=0.9,
58
- temperature=0.2,
59
- )
60
-
61
- return tokenizer.decode(output[0]["generated_text"], skip_special_tokens=True)
 
 
 
62
 
63
  def format_output(output):
64
  return output.strip()
65
 
66
  def fetch_from_store(query_embeddings, dataset_id):
67
- try:
 
68
  file_path = hf_hub_download(repo_id=dataset_id, filename="embeddings.json", repo_type="dataset", token=API_TOKEN)
69
- except:
70
- return "Couldn't find the embeddings on the Hub! Did you save them before?"
71
 
72
- with open(file_path, 'r') as f:
73
  dataset = json.load(f)
74
 
75
- all_similarities = []
76
- for text_embedding in dataset["embeddings"]:
77
- sim = torch.nn.functional.cosine_similarity(torch.tensor(query_embeddings), torch.tensor(text_embedding), dim=0)
78
- all_similarities.append(sim.item())
 
 
 
 
 
 
79
 
80
- most_similar_index = all_similarities.index(max(all_similarities))
81
- return dataset["texts"][most_similar_index]
82
 
83
  @space.GPU
84
  def rag_chain(question,files):
@@ -86,16 +102,27 @@ def rag_chain(question,files):
86
 
87
  if files is not None:
88
  texts = get_text_from_files(files)
89
- embeddings = get_embeddings(texts)
90
- upload_embeddings_to_hub(texts, embeddings, dataset_id=DATASET_ID)
 
 
 
 
 
91
 
92
  input_embedding = get_embeddings(texts=[question])
93
  # Get most relevant text:
94
- context = fetch_from_store(input_embedding[0], dataset_id=DATASET_ID)
95
- #Get the final output
96
- output = get_llm_response(question,context)
97
- return format_output(output)
98
-
 
 
 
 
 
 
99
 
100
  # --- Upload embedding to the Hub (only run one time) ---
101
  def upload_embeddings_to_hub(texts, embeddings, dataset_id):
@@ -103,9 +130,8 @@ def upload_embeddings_to_hub(texts, embeddings, dataset_id):
103
  try:
104
  create_repo(repo_id=dataset_id, repo_type="dataset", private=False)
105
  print(f"Dataset repo {dataset_id} created successfully!")
106
- except:
107
- print(f"Dataset repo {dataset_id} already exists.")
108
-
109
 
110
  dataset = {
111
  "texts": texts,
@@ -163,8 +189,8 @@ def visualize_data(files, dataset_id):
163
 
164
  try:
165
  file_path = hf_hub_download(repo_id=dataset_id, filename="embeddings.json", repo_type="dataset", token=API_TOKEN)
166
- except:
167
- return "Couldn't find the embeddings on the Hub! Did you save them before?", None, None
168
 
169
  with open(file_path, 'r') as f:
170
  dataset = json.load(f)
@@ -202,4 +228,6 @@ demo.launch(server_name="0.0.0.0")
202
 
203
  # --- Upload embeddings to Hub(one time execution)---
204
  # local_data_path = "data" # Please set this path to where your data is!
205
- # upload_embeddings_to_hub(local_data_path, dataset_id=DATASET_ID)
 
 
 
13
  import numpy as np
14
  import plotly.graph_objects as go
15
  from sklearn.manifold import TSNE
16
+ import traceback
17
 
18
  # --- User Configuration ---
19
  HF_USERNAME = os.getenv("HF_USERNAME")
 
30
  def get_text_from_files(file_paths):
31
  all_text = []
32
  for filepath in file_paths:
33
+ try:
34
+ with open(filepath.name, "r", encoding="utf-8") as file:
35
+ all_text.append(file.read())
36
+ except Exception as e:
37
+ print(f"Error reading file: {file.name} with error: {e}. Skipping file.")
38
  return all_text
39
 
40
  def get_embeddings(texts, model_id="sentence-transformers/all-mpnet-base-v2"):
41
+ try:
42
+ model = pipeline('feature-extraction', model=model_id, device="cuda")
43
+ embeddings = model(texts)
44
+ except Exception as e:
45
+ print(f"Error during embeddings: {e}. Please check your GPU configuration and model.")
46
+ return None
47
  return embeddings
48
 
49
  def get_llm_response(query, context, model_id="HuggingFaceH4/zephyr-7b-beta"):
50
+ try:
51
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
52
+ model = pipeline("text-generation", model=model_id, device="cuda")
53
+ prompt = f"""
54
+ Answer the following question according to the provided context.
55
+
56
+ Question: {query}
57
+ Context: {context}
58
+ Answer:
59
+ """
60
+ inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
61
+ output = model(
62
+ **inputs,
63
+ max_new_tokens=250,
64
+ do_sample=True,
65
+ top_p=0.9,
66
+ temperature=0.2,
67
+ )
68
+ return tokenizer.decode(output[0]["generated_text"], skip_special_tokens=True)
69
+
70
+ except Exception as e:
71
+ print(f"Error during text generation {e}. Please check your settings")
72
+ return f"There was an error. Please check settings and if the models are available: {str(e)}"
73
 
74
  def format_output(output):
75
  return output.strip()
76
 
77
  def fetch_from_store(query_embeddings, dataset_id):
78
+
79
+ try:
80
  file_path = hf_hub_download(repo_id=dataset_id, filename="embeddings.json", repo_type="dataset", token=API_TOKEN)
81
+ except Exception as e:
82
+ return f"Couldn't find the embeddings on the Hub! Did you save them before? {str(e)}"
83
 
84
+ with open(file_path, 'r') as f:
85
  dataset = json.load(f)
86
 
87
+ all_similarities = []
88
+ for text_embedding in dataset["embeddings"]:
89
+ try:
90
+ sim = torch.nn.functional.cosine_similarity(torch.tensor(query_embeddings), torch.tensor(text_embedding), dim=0)
91
+ all_similarities.append(sim.item())
92
+ except Exception as e:
93
+ print (f"Error calculating similarity {e} skipping text entry")
94
+
95
+ most_similar_index = all_similarities.index(max(all_similarities))
96
+ return dataset["texts"][most_similar_index]
97
 
 
 
98
 
99
  @space.GPU
100
  def rag_chain(question,files):
 
102
 
103
  if files is not None:
104
  texts = get_text_from_files(files)
105
+ if texts:
106
+ embeddings = get_embeddings(texts)
107
+ if embeddings:
108
+ upload_embeddings_to_hub(texts, embeddings, dataset_id=DATASET_ID)
109
+ else:
110
+ return "There was an error uploading the dataset."
111
+
112
 
113
  input_embedding = get_embeddings(texts=[question])
114
  # Get most relevant text:
115
+ if input_embedding:
116
+ context = fetch_from_store(input_embedding[0], dataset_id=DATASET_ID)
117
+ if context:
118
+ #Get the final output
119
+ output = get_llm_response(question,context)
120
+ return format_output(output)
121
+ else:
122
+ return "There was an error. Couldn't fetch a correct context. Is there embeddings in the Hub?"
123
+ else:
124
+ return "There was an error generating the embeddings. Try again"
125
+
126
 
127
  # --- Upload embedding to the Hub (only run one time) ---
128
  def upload_embeddings_to_hub(texts, embeddings, dataset_id):
 
130
  try:
131
  create_repo(repo_id=dataset_id, repo_type="dataset", private=False)
132
  print(f"Dataset repo {dataset_id} created successfully!")
133
+ except Exception as e:
134
+ print(f"Dataset repo {dataset_id} already exists, {e}")
 
135
 
136
  dataset = {
137
  "texts": texts,
 
189
 
190
  try:
191
  file_path = hf_hub_download(repo_id=dataset_id, filename="embeddings.json", repo_type="dataset", token=API_TOKEN)
192
+ except Exception as e:
193
+ return f"Couldn't find the embeddings on the Hub! Did you save them before? {str(e)}", None, None
194
 
195
  with open(file_path, 'r') as f:
196
  dataset = json.load(f)
 
228
 
229
  # --- Upload embeddings to Hub(one time execution)---
230
  # local_data_path = "data" # Please set this path to where your data is!
231
+ # texts = get_text_from_files(os.listdir(local_data_path))
232
+ # embeddings = get_embeddings(texts)
233
+ # upload_embeddings_to_hub(texts, embeddings, dataset_id=DATASET_ID)