Spaces:

Ultronprime
/

Emails2go

Build error

App Files Files Community

Ultronprime committed on Feb 3, 2025

Commit

25e3a1a

verified ·

1 Parent(s): 7334e90

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -47

app.py CHANGED Viewed

@@ -13,6 +13,7 @@ from sklearn.decomposition import PCA
 import numpy as np
 import plotly.graph_objects as go
 from sklearn.manifold import TSNE
 # --- User Configuration ---
 HF_USERNAME = os.getenv("HF_USERNAME")
@@ -29,56 +30,71 @@ if not API_TOKEN:
 def get_text_from_files(file_paths):
     all_text = []
     for filepath in file_paths:
-      with open(filepath.name, "r", encoding="utf-8") as file:
-           all_text.append(file.read())
     return all_text
 def get_embeddings(texts, model_id="sentence-transformers/all-mpnet-base-v2"):
-    model = pipeline('feature-extraction', model=model_id, device="cuda")
-    embeddings = model(texts)
     return embeddings
 def get_llm_response(query, context, model_id="HuggingFaceH4/zephyr-7b-beta"):
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    model = pipeline("text-generation", model=model_id, device="cuda")
-    prompt = f"""
-        Answer the following question according to the provided context.
-        Question: {query}
-        Context: {context}
-        Answer:
-    """
-    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-    output = model(
-        **inputs,
-        max_new_tokens=250,
-        do_sample=True,
-        top_p=0.9,
-        temperature=0.2,
-    )
-    return tokenizer.decode(output[0]["generated_text"], skip_special_tokens=True)
 def format_output(output):
      return output.strip()
 def fetch_from_store(query_embeddings, dataset_id):
-    try:
         file_path = hf_hub_download(repo_id=dataset_id, filename="embeddings.json", repo_type="dataset", token=API_TOKEN)
-    except:
-         return "Couldn't find the embeddings on the Hub! Did you save them before?"
-    with open(file_path, 'r') as f:
         dataset = json.load(f)
-    all_similarities = []
-    for text_embedding in dataset["embeddings"]:
-        sim = torch.nn.functional.cosine_similarity(torch.tensor(query_embeddings), torch.tensor(text_embedding), dim=0)
-        all_similarities.append(sim.item())
-    most_similar_index = all_similarities.index(max(all_similarities))
-    return dataset["texts"][most_similar_index]
 @space.GPU
 def rag_chain(question,files):
@@ -86,16 +102,27 @@ def rag_chain(question,files):
     if files is not None:
       texts = get_text_from_files(files)
-      embeddings = get_embeddings(texts)
-      upload_embeddings_to_hub(texts, embeddings, dataset_id=DATASET_ID)
     input_embedding = get_embeddings(texts=[question])
     # Get most relevant text:
-    context = fetch_from_store(input_embedding[0], dataset_id=DATASET_ID)
-    #Get the final output
-    output = get_llm_response(question,context)
-    return format_output(output)
 # --- Upload embedding to the Hub (only run one time) ---
 def upload_embeddings_to_hub(texts, embeddings, dataset_id):
@@ -103,9 +130,8 @@ def upload_embeddings_to_hub(texts, embeddings, dataset_id):
     try:
         create_repo(repo_id=dataset_id, repo_type="dataset", private=False)
         print(f"Dataset repo {dataset_id} created successfully!")
-    except:
-       print(f"Dataset repo {dataset_id} already exists.")
     dataset = {
         "texts": texts,
@@ -163,8 +189,8 @@ def visualize_data(files, dataset_id):
     try:
        file_path = hf_hub_download(repo_id=dataset_id, filename="embeddings.json", repo_type="dataset", token=API_TOKEN)
-    except:
-       return "Couldn't find the embeddings on the Hub! Did you save them before?", None, None
     with open(file_path, 'r') as f:
          dataset = json.load(f)
@@ -202,4 +228,6 @@ demo.launch(server_name="0.0.0.0")
 # ---  Upload embeddings to Hub(one time execution)---
 # local_data_path = "data"  # Please set this path to where your data is!
-# upload_embeddings_to_hub(local_data_path, dataset_id=DATASET_ID)

 import numpy as np
 import plotly.graph_objects as go
 from sklearn.manifold import TSNE
+import traceback
 # --- User Configuration ---
 HF_USERNAME = os.getenv("HF_USERNAME")
 def get_text_from_files(file_paths):
     """Read every readable UTF-8 text file in *file_paths*.

     Args:
         file_paths: Iterable of uploaded-file objects that expose their
             on-disk location as ``.name``. Plain path strings and
             ``os.PathLike`` objects are accepted as well.

     Returns:
         A list with the full contents of each file that could be read, in
         input order. Files that cannot be read are reported on stdout and
         skipped, so the list may be shorter than the input.
     """
     all_text = []
     for filepath in file_paths:
         # Resolve the path before opening so the error message below can
         # always name the offending file, even when open() itself failed
         # and no file handle was ever bound.
         if isinstance(filepath, (str, os.PathLike)):
             path = filepath
         else:
             path = getattr(filepath, "name", filepath)
         try:
             with open(path, "r", encoding="utf-8") as file:
                 all_text.append(file.read())
         except Exception as e:
             # Deliberate best-effort: one bad upload must not abort the rest.
             print(f"Error reading file: {path} with error: {e}. Skipping file.")
     return all_text
 def get_embeddings(texts, model_id="sentence-transformers/all-mpnet-base-v2"):
     """Embed *texts* with a Hugging Face ``feature-extraction`` pipeline.

     Args:
         texts: List of strings to embed.
         model_id: Hub id of the embedding model to load.

     Returns:
         Whatever the pipeline returns for *texts* (one entry per input
         text), or ``None`` when the model could not be loaded or run.
         NOTE(review): a feature-extraction pipeline presumably yields
         per-token vectors rather than one pooled vector per text; confirm
         before relying on the shape.
     """
     try:
         # Imported here so this function does not depend on an import it
         # cannot see; torch is only needed to probe for a GPU.
         import torch
         # Use the GPU when present, otherwise run on CPU instead of failing.
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model = pipeline('feature-extraction', model=model_id, device=device)
         embeddings = model(texts)
     except Exception as e:
         print(f"Error during embeddings: {e}. Please check your GPU configuration and model.")
         return None
     return embeddings
 def get_llm_response(query, context, model_id="HuggingFaceH4/zephyr-7b-beta"):
     """Answer *query* from *context* with a text-generation model.

     Args:
         query: The user's question.
         context: Retrieved text the answer should be based on.
         model_id: Hub id of the generation model to load.

     Returns:
         The generated answer text (without the prompt), or a human-readable
         error string when the model could not be loaded or run.
     """
     try:
         # Imported here so this function does not depend on an import it
         # cannot see; torch is only needed to probe for a GPU.
         import torch
         # Use the GPU when present, otherwise run on CPU instead of failing.
         device = "cuda" if torch.cuda.is_available() else "cpu"
         generator = pipeline("text-generation", model=model_id, device=device)
         prompt = f"""
             Answer the following question according to the provided context.
             Question: {query}
             Context: {context}
             Answer:
         """
         # The pipeline tokenizes and decodes internally: it takes the raw
         # prompt string and returns already-decoded text, so no separate
         # tokenizer round-trip is needed.
         output = generator(
             prompt,
             max_new_tokens=250,
             do_sample=True,
             top_p=0.9,
             temperature=0.2,
             return_full_text=False,  # return only the answer, not the echoed prompt
         )
         return output[0]["generated_text"]
     except Exception as e:
         print(f"Error during text generation {e}. Please check your settings")
         return f"There was an error. Please check settings and if the models are available: {str(e)}"
 def format_output(output):
     """Return *output* with leading and trailing whitespace removed."""
     cleaned = output.strip()
     return cleaned
 def fetch_from_store(query_embeddings, dataset_id):
     """Return the stored text whose embedding is closest to the query.

     Downloads ``embeddings.json`` from the dataset repo *dataset_id* and
     compares each entry of its ``"embeddings"`` list with
     *query_embeddings* by cosine similarity.

     Args:
         query_embeddings: Embedding of the question, either a flat vector
             or a nested list whose last axis is the embedding dimension.
         dataset_id: Hub id of the dataset repo holding ``embeddings.json``.

     Returns:
         The entry of ``dataset["texts"]`` at the position of the best
         match, or ``None`` when the file cannot be downloaded or no stored
         embedding can be compared. ``None`` (rather than an error string)
         keeps a failure from being passed on as if it were context.
     """
     try:
         file_path = hf_hub_download(repo_id=dataset_id, filename="embeddings.json", repo_type="dataset", token=API_TOKEN)
     except Exception as e:
         print(f"Couldn't find the embeddings on the Hub! Did you save them before? {str(e)}")
         return None
     with open(file_path, 'r') as f:
         dataset = json.load(f)
     try:
         # Loop-invariant: convert the query once, not once per stored entry.
         query_vector = _pooled_vector(query_embeddings)
     except Exception as e:
         print(f"Error preparing the query embedding {e}")
         return None
     best_index = None
     best_score = float("-inf")
     # Track the position in the stored list itself so a skipped entry can
     # never shift the index used to look up dataset["texts"].
     for index, text_embedding in enumerate(dataset["embeddings"]):
         try:
             score = torch.nn.functional.cosine_similarity(query_vector, _pooled_vector(text_embedding), dim=0).item()
         except Exception as e:
             print (f"Error calculating similarity {e} skipping text entry")
             continue
         if score > best_score:
             best_index, best_score = index, score
     if best_index is None:
         print("No stored embedding could be compared with the query.")
         return None
     return dataset["texts"][best_index]


 def _pooled_vector(embedding):
     """Collapse an embedding of any nesting depth into one 1-D float vector."""
     tensor = torch.as_tensor(embedding, dtype=torch.float32)
     # (..., dim) -> (n, dim) -> mean over n -> (dim,). A flat vector becomes
     # (1, dim) and comes back unchanged.
     return tensor.reshape(-1, tensor.shape[-1]).mean(dim=0)
 @space.GPU
 def rag_chain(question,files):
     """Answer *question*, optionally indexing newly uploaded *files* first.

     When *files* is given, their text is embedded and uploaded to the
     dataset repo ``DATASET_ID``. The question is then embedded, the closest
     stored text is fetched as context, and the language model produces the
     answer.

     Args:
         question: The user's question.
         files: Uploaded file objects, or ``None`` to query the existing store.

     Returns:
         The formatted answer, or a human-readable error message naming the
         step that failed.

     NOTE(review): the decorator reads ``space.GPU``; the Hugging Face
     ZeroGPU package is normally imported as ``spaces`` - confirm the import.
     """
     if files is not None:
         texts = get_text_from_files(files)
         if texts:
             embeddings = get_embeddings(texts)
             if not embeddings:
                 # Nothing was uploaded yet: the embedding step is what failed.
                 return "There was an error generating the embeddings for the uploaded files."
             upload_embeddings_to_hub(texts, embeddings, dataset_id=DATASET_ID)
     input_embedding = get_embeddings(texts=[question])
     if not input_embedding:
         return "There was an error generating the embeddings. Try again"
     # Get most relevant text:
     context = fetch_from_store(input_embedding[0], dataset_id=DATASET_ID)
     if not context:
         return "There was an error. Couldn't fetch a correct context. Is there embeddings in the Hub?"
     # Get the final output
     output = get_llm_response(question,context)
     return format_output(output)
 # --- Upload embedding to the Hub (only run one time) ---
 def upload_embeddings_to_hub(texts, embeddings, dataset_id):
     try:
         create_repo(repo_id=dataset_id, repo_type="dataset", private=False)
         print(f"Dataset repo {dataset_id} created successfully!")
+    except Exception as e:
+       print(f"Dataset repo {dataset_id} already exists, {e}")
     dataset = {
         "texts": texts,
     try:
        file_path = hf_hub_download(repo_id=dataset_id, filename="embeddings.json", repo_type="dataset", token=API_TOKEN)
+    except Exception as e:
+       return f"Couldn't find the embeddings on the Hub! Did you save them before? {str(e)}", None, None
     with open(file_path, 'r') as f:
          dataset = json.load(f)
 # ---  Upload embeddings to Hub(one time execution)---
 # local_data_path = "data"  # Please set this path to where your data is!
+# texts = get_text_from_files(os.listdir(local_data_path))
+# embeddings = get_embeddings(texts)
+# upload_embeddings_to_hub(texts, embeddings, dataset_id=DATASET_ID)