Frenchizer committed on
Commit
c1151ec
·
verified ·
1 Parent(s): e47e728

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -106
app.py CHANGED
@@ -3,13 +3,14 @@ from transformers import AutoTokenizer, AutoModel
3
  from sklearn.metrics.pairwise import cosine_similarity
4
  import torch
5
  import numpy as np
6
- from gradio_client import Client
 
7
  from functools import lru_cache
8
 
9
  # Cache the model and tokenizer using lru_cache
10
  @lru_cache(maxsize=1)
11
  def load_model_and_tokenizer():
12
- model_name = "./all-MiniLM-L6-v2" # Replace with your Space and model path
13
  tokenizer = AutoTokenizer.from_pretrained(model_name)
14
  model = AutoModel.from_pretrained(model_name)
15
  return tokenizer, model
@@ -19,117 +20,77 @@ tokenizer, model = load_model_and_tokenizer()
19
 
20
  # Precompute label embeddings
21
  labels = [
22
- "aerospace", "anatomy", "anthropology", "art",
23
- "automotive", "blockchain", "biology", "chemistry",
24
- "cryptocurrency", "data science", "design", "e-commerce",
25
- "education", "engineering", "entertainment", "environment",
26
- "fashion", "finance", "food commerce", "gaming",
27
- "healthcare", "history", "information technology",
28
- "legal", "machine learning", "marketing", "medicine",
29
- "music", "philosophy", "physics", "politics", "real estate", "retail",
30
- "robotics", "social media", "sports", "technical",
31
- "tourism", "travel"
32
  ]
33
 
34
  tones = [
35
  "formal", "positive", "negative", "poetic", "polite", "subtle", "casual", "neutral",
36
- "informal", "pompous", "sustained", "rude", "sustained",
37
- "sophisticated", "playful", "serious", "friendly"
38
  ]
39
 
40
- styles = [
41
- "poetry", "novel", "theater", "slang", "speech", "keywords", "html", "programming"
42
- ]
43
-
44
- gender_number = [
45
- "masculine singular", "masculine plural", "feminine singular", "feminine plural"
46
- ]
47
-
48
- @lru_cache(maxsize=1)
49
- def precompute_label_embeddings():
50
- inputs = tokenizer(labels, padding=True, truncation=True, return_tensors="pt")
51
  with torch.no_grad():
52
- outputs = model(**inputs)
53
- return outputs.last_hidden_state.mean(dim=1).numpy() # Mean pooling for embeddings
54
-
55
- label_embeddings = precompute_label_embeddings()
56
 
57
- # Softmax function to convert scores to probabilities
58
- def softmax(x):
59
- exp_x = np.exp(x - np.max(x)) # Subtract max for numerical stability
60
- return exp_x / exp_x.sum()
61
 
62
- # Function to detect context
63
- def detect_context(input_text, threshold=0.03):
64
- # Encode the input text
65
- inputs = tokenizer([input_text], padding=True, truncation=True, return_tensors="pt")
66
  with torch.no_grad():
67
- outputs = model(**inputs)
68
- input_embedding = outputs.last_hidden_state.mean(dim=1).numpy() # Mean pooling for embedding
69
-
70
- # Compute cosine similarities
71
- similarities = cosine_similarity(input_embedding, label_embeddings)[0]
72
-
73
- # Apply softmax to convert similarities to probabilities
74
- probabilities = softmax(similarities)
75
-
76
- # Pair each label with its probability
77
- label_probabilities = list(zip(labels, probabilities))
78
-
79
- # Filter contexts with confidence >= threshold
80
- high_confidence_contexts = [(label, score) for label, score in label_probabilities if score >= threshold]
81
-
82
- # If no contexts meet the threshold, default to "general"
83
- if not high_confidence_contexts:
84
- high_confidence_contexts = [("general", 1.0)] # Assign a default score of 1.0 for "general"
85
-
86
- return high_confidence_contexts
87
-
88
- # Mock translation clients for different contexts
89
- def get_translation_client(context):
90
- """
91
- Returns the appropriate Hugging Face Space client for the given context.
92
- For now, all contexts use the same mock space.
93
- """
94
- return Client("Frenchizer/space_7") # Replace with actual Space paths for each context
95
-
96
- def translate_text(input_text, context):
97
- """
98
- Translates the input text using the appropriate model for the given context.
99
- """
100
- client = get_translation_client(context)
101
- return client.predict(input_text)
102
-
103
- def process_request(input_text):
104
- # Step 1: Detect context
105
- context_results = detect_context(input_text)
106
-
107
- # Step 2: Translate the text for each context
108
- translations = {}
109
- for context, score in context_results:
110
- translations[context] = translate_text(input_text, context)
111
-
112
- # Step 3: Print the list of high-confidence contexts and translations
113
- print("High-confidence contexts (score >= 0.022):", context_results)
114
- print("Translations:", translations)
115
-
116
- # Return the translations and contexts
117
- return translations, context_results
118
-
119
- # Gradio interface
120
- def gradio_interface(input_text):
121
- translation, contexts = process_request(input_text)
122
- # Format the output
123
- output = f"{translation}\n"
124
- return output.strip()
125
-
126
- # Create the Gradio interface
127
- interface = gr.Interface(
128
- fn=gradio_interface,
129
- inputs="text",
130
- outputs="text",
131
- title="Frenchizer",
132
- description="Translate text from English to French with context detection and threshold."
133
- )
134
-
135
- interface.launch()
 
3
  from sklearn.metrics.pairwise import cosine_similarity
4
  import torch
5
  import numpy as np
6
+ import json
7
+ import requests
8
  from functools import lru_cache
9
 
10
# Model/tokenizer loading is cached so the checkpoint is read from disk only once.
@lru_cache(maxsize=1)
def load_model_and_tokenizer():
    """Load the local sentence-embedding model and its tokenizer (cached).

    Returns:
        tuple: ``(tokenizer, model)`` loaded from the local checkout.
    """
    model_path = "./all-MiniLM-L6-v2"  # local checkout; adjust if the model lives elsewhere
    return (
        AutoTokenizer.from_pretrained(model_path),
        AutoModel.from_pretrained(model_path),
    )
 
20
 
21
# Candidate domain labels. Order matters: detect_context indexes this list
# with the argmax over the precomputed label-embedding matrix.
labels = [
    "aerospace", "anatomy", "anthropology", "art",
    "automotive", "blockchain", "biology", "chemistry",
    "cryptocurrency", "data science", "design", "e-commerce",
    "education", "engineering", "entertainment", "environment",
    "fashion", "finance", "food commerce", "gaming",
    "healthcare", "history", "information technology",
    "legal", "machine learning", "marketing", "medicine",
    "music", "philosophy", "physics", "politics",
    "real estate", "retail", "robotics", "social media",
    "sports", "technical", "tourism", "travel",
]

# Tone descriptors.
# NOTE(review): not referenced by any function in this file's new code —
# confirm whether a downstream consumer needs it or it can be removed.
tones = [
    "formal", "positive", "negative", "poetic",
    "polite", "subtle", "casual", "neutral",
    "informal", "pompous", "sustained", "rude",
]
36
 
37
def get_label_embeddings():
    """Embed every label once, as a single batch.

    Each label's vector is the hidden state of its first ([CLS]) token.

    NOTE(review): all-MiniLM-L6-v2 is published as a mean-pooling
    sentence-embedding model; the CLS-token pooling used here (and,
    matching it, in detect_context) may yield weaker similarities —
    confirm this is intentional.

    Returns:
        numpy.ndarray of shape (len(labels), hidden_dim).
    """
    batch = tokenizer(labels, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden = model(**batch).last_hidden_state
    # One row per label: the position-0 ([CLS]) token embedding.
    return hidden[:, 0, :].numpy()


# Computed once at import time; detect_context scores against this matrix.
label_embeddings = get_label_embeddings()
 
 
 
45
 
46
def detect_context(text: str):
    """Return the label whose embedding best matches *text*.

    The input is encoded with the same CLS-token pooling used for the
    precomputed label embeddings, scored against every label by cosine
    similarity, and the single highest-scoring label is returned.

    NOTE(review): CLS pooling — see the note on get_label_embeddings.

    Args:
        text: the input text to classify.

    Returns:
        str: one entry of ``labels``.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        embedding = model(**encoded).last_hidden_state[:, 0, :].numpy()

    # Row 0: similarities of the single input against each label.
    scores = cosine_similarity(embedding, label_embeddings)[0]
    return labels[int(np.argmax(scores))]
60
+
61
def process_and_translate(text: str):
    """Detect the input's domain context, then translate it via the remote
    Frenchizer space.

    Args:
        text: English text to translate.

    Returns:
        str: a JSON document — ``{"context", "translation"}`` on success,
        ``{"error", "context"}`` on any failure — so the Gradio JSON
        component always receives something well-formed.
    """
    detected_context = detect_context(text)

    try:
        # Forward text and detected context to space_7 for translation.
        # The explicit timeout is the fix: without it a dead/slow remote
        # space would hang this UI callback indefinitely.
        response = requests.post(
            "https://api.gradio.app/v2/Frenchizer/space_7/predict",
            json={"data": [text, detected_context]},
            timeout=30,
        )
        translation_response = response.json()

        if "error" in translation_response:
            return json.dumps({
                "error": "Translation failed",
                "context": detected_context,
            })

        return json.dumps({
            "context": detected_context,
            "translation": translation_response["data"][0],
        })

    except Exception as e:
        # Network failures, non-JSON bodies, or a missing "data" key all
        # land here; report them instead of crashing the UI callback.
        return json.dumps({
            "error": str(e),
            "context": detected_context,
        })
87
+
88
# --- Gradio UI ------------------------------------------------------------
# One text input, one JSON output, and a button wired to the pipeline above.
with gr.Blocks() as interface:
    text_in = gr.Textbox(label="Input Text")
    result_out = gr.JSON(label="Context & Translation")
    run_btn = gr.Button("Process & Translate")
    run_btn.click(fn=process_and_translate, inputs=[text_in], outputs=[result_out])

if __name__ == "__main__":
    interface.launch()