heymenn commited on
Commit
d2f7eb8
·
verified ·
1 Parent(s): 53aa5e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -244
app.py CHANGED
@@ -1,247 +1,9 @@
1
- import pandas as pd
2
- import gradio as gr
3
- from fastapi import FastAPI, HTTPException
4
- from pydantic import BaseModel
5
- from sentence_transformers import SentenceTransformer
6
- from sklearn.metrics.pairwise import cosine_similarity
7
- import numpy as np
8
  import os
9
- from typing import List, Tuple
10
 
11
# --- Configuration ---
# Spreadsheet of technologies; expected to sit next to app.py.
EXCEL_FILE_PATH = 'technologies.xlsx'
# Compact general-purpose sentence-embedding model.
MODEL_NAME = 'all-MiniLM-L6-v2'

# --- Data Loading and Preprocessing ---
# Module-level state, populated once by load_data_and_model() at startup.
technologies_df = None  # DataFrame with 'technology' / 'description' columns
embeddings = None       # per-description embedding matrix, row-aligned with the DataFrame
model = None            # the SentenceTransformer instance
20
def load_data_and_model():
    """Load the Excel catalogue and the sentence-transformer model.

    Populates the module-level ``technologies_df``, ``embeddings`` and
    ``model`` globals so the predict functions can use them.

    Raises:
        gr.Error: if the Excel file is missing, lacks the required columns,
            or any other failure occurs during loading.
    """
    global technologies_df, embeddings, model
    try:
        # Fail fast with a clear message if the data file is absent.
        if not os.path.exists(EXCEL_FILE_PATH):
            raise FileNotFoundError(f"Error: The file '{EXCEL_FILE_PATH}' was not found.")

        technologies_df = pd.read_excel(EXCEL_FILE_PATH)

        # Both columns are required downstream; validate before any heavy work.
        if 'technology' not in technologies_df.columns or 'description' not in technologies_df.columns:
            raise ValueError("Excel file must contain 'technology' and 'description' columns.")

        # Missing descriptions become empty strings so every row gets an embedding.
        technologies_df['description'] = technologies_df['description'].fillna('')
        descriptions = technologies_df['description'].tolist()

        print(f"Loading sentence transformer model: {MODEL_NAME}...")
        model = SentenceTransformer(MODEL_NAME)
        print("Model loaded.")

        # Pre-compute one embedding per technology description.
        print("Generating embeddings for technology descriptions...")
        embeddings = model.encode(descriptions, show_progress_bar=False)  # no progress bar on Spaces
        print("Embeddings generated.")

    except FileNotFoundError as e:
        print(e)
        # Chain the original exception so the traceback keeps the root cause.
        raise gr.Error(f"Error loading data: {e}") from e
    except ValueError as e:
        print(f"Data validation error: {e}")
        raise gr.Error(f"Data validation error: {e}") from e
    except Exception as e:
        print(f"An unexpected error occurred during data loading: {e}")
        raise gr.Error(f"An unexpected error occurred during data loading: {e}") from e
58
-
59
- # --- Helper Function ---
60
def get_top_10_tech(problem_description: str, top_k: int = 10) -> List[Tuple[int, float]]:
    """Rank technologies by cosine similarity to a problem description.

    Args:
        problem_description: The technical problem described by the user.
        top_k: How many of the best matches to return. Defaults to 10,
            preserving the original behaviour for existing callers.

    Returns:
        A list of ``(row_index, similarity_score)`` tuples indexing into
        ``technologies_df``, sorted by score descending.

    Raises:
        gr.Error: If the model/embeddings have not been loaded yet.
    """
    if model is None or embeddings is None or technologies_df is None:
        raise gr.Error("Server not ready, embeddings not loaded.")

    # encode() expects a batch, so wrap the single query in a list.
    problem_embedding = model.encode([problem_description])

    # One similarity per technology; [0] unwraps the single query row.
    similarities = cosine_similarity(problem_embedding, embeddings)[0]

    # argsort is ascending, so take the last top_k indices and reverse
    # them to put the highest similarity first. (Safe even when fewer
    # than top_k technologies exist — the slice just returns them all.)
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    return [(idx, similarities[idx]) for idx in top_indices]
92
-
93
- # --- Gradio Interface Functions ---
94
def predict(problem_description: str):
    """Gradio handler: Markdown for the 2 technologies closest to the query."""
    try:
        ranked = get_top_10_tech(problem_description)
        if not ranked:
            return "No matching technologies found."
        best_two = [idx for idx, _ in ranked[:2]]
        rendered = [
            f"**Technology:** {row['technology']}\n**Description:** {row['description']}"
            for _, row in technologies_df.iloc[best_two].iterrows()
        ]
        return "\n\n".join(rendered)
    except gr.Error as err:
        return str(err)
    except Exception as err:
        print(f"Error in prediction: {err}")
        return "An error occurred while processing your request."
111
-
112
def predict_worst(problem_description: str):
    """Gradio handler: the 2 weakest matches among the top-10 candidates."""
    try:
        ranked = get_top_10_tech(problem_description)
        if len(ranked) < 2:
            return "Not enough matching technologies to find the least similar."
        tail = [idx for idx, _ in ranked[-2:]]
        rendered = [
            f"**Technology:** {row['technology']}\n**Description:** {row['description']}"
            for _, row in technologies_df.iloc[tail].iterrows()
        ]
        return "\n\n".join(rendered)
    except gr.Error as err:
        return str(err)
    except Exception as err:
        print(f"Error in predict_worst: {err}")
        return "An error occurred while processing your request."
129
-
130
def predict_most_similar_pairs(problem_description: str):
    """Gradio handler: the 2 most similar technology pairs within the top 10."""
    try:
        ranked = get_top_10_tech(problem_description)
        if len(ranked) < 2:
            return "Not enough matching technologies to form pairs."

        indices = [idx for idx, _ in ranked]
        vectors = embeddings[indices]
        subset = technologies_df.iloc[indices].reset_index(drop=True)

        sim_matrix = cosine_similarity(vectors)
        count = len(subset)
        # Every unordered pair (a, b) with a < b, tagged with its similarity.
        candidates = [
            ((a, subset['technology'][a], subset['description'][a]),
             (b, subset['technology'][b], subset['description'][b]),
             sim_matrix[a, b])
            for a in range(count)
            for b in range(a + 1, count)
        ]

        candidates.sort(key=lambda item: item[2], reverse=True)
        sections = []
        for i, ((_, tech1, desc1), (_, tech2, desc2), score) in enumerate(candidates[:2]):
            sections.append(f"**Pair {i+1}:**\n"
                            f"**Technology 1:** {tech1}\nDescription: {desc1}\n"
                            f"**Technology 2:** {tech2}\nDescription: {desc2}\n"
                            f"**Similarity Score:** {score:.4f}\n\n")
        return "\n".join(sections) if sections else "No similar pairs found."
    except gr.Error as err:
        return str(err)
    except Exception as err:
        print(f"Error in predict_most_similar_pairs: {err}")
        return "An error occurred while processing the request for similar pairs."
165
-
166
def predict_least_similar_pairs(problem_description: str):
    """Gradio handler: the 2 least similar technology pairs within the top 10."""
    try:
        ranked = get_top_10_tech(problem_description)
        if len(ranked) < 2:
            return "Not enough matching technologies to form pairs."

        indices = [idx for idx, _ in ranked]
        vectors = embeddings[indices]
        subset = technologies_df.iloc[indices].reset_index(drop=True)

        sim_matrix = cosine_similarity(vectors)
        count = len(subset)
        # Every unordered pair (a, b) with a < b, tagged with its similarity.
        candidates = [
            ((a, subset['technology'][a], subset['description'][a]),
             (b, subset['technology'][b], subset['description'][b]),
             sim_matrix[a, b])
            for a in range(count)
            for b in range(a + 1, count)
        ]

        # Ascending sort: the least similar pairs come first.
        candidates.sort(key=lambda item: item[2])
        sections = []
        for i, ((_, tech1, desc1), (_, tech2, desc2), score) in enumerate(candidates[:2]):
            sections.append(f"**Pair {i+1}:**\n"
                            f"**Technology 1:** {tech1}\nDescription: {desc1}\n"
                            f"**Technology 2:** {tech2}\nDescription: {desc2}\n"
                            f"**Similarity Score:** {score:.4f}\n\n")
        return "\n".join(sections) if sections else "No pairs found."
    except gr.Error as err:
        return str(err)
    except Exception as err:
        print(f"Error in predict_least_similar_pairs: {err}")
        return "An error occurred while processing the request for least similar pairs."
201
-
202
- # --- Gradio Interface ---
203
# --- Gradio Interface ---
def _build_interface(fn, output_label, title, description, examples):
    """Factory for one tab: all four tools share the same input widget."""
    return gr.Interface(
        fn=fn,
        inputs=gr.Textbox(label="Enter a technical problem description"),
        outputs=gr.Textbox(label=output_label),
        title=title,
        description=description,
        examples=examples,
    )

iface = _build_interface(
    predict,
    "Top 2 Most Similar Technologies",
    "Technology Recommender",
    "Enter a description of a technical problem to find the top 2 most relevant technologies.",
    ["Troubleshooting network connectivity issues", "Need a database for a small web application"],
)

iface_worst = _build_interface(
    predict_worst,
    "Bottom 2 Least Similar Technologies (from Top 10)",
    "Find Least Similar Technologies",
    "Enter a description of a technical problem to find the bottom 2 least relevant technologies from the top 10 matches.",
    ["Scaling a microservices architecture", "Implementing a new UI framework"],
)

iface_mixing_max = _build_interface(
    predict_most_similar_pairs,
    "Top 2 Most Similar Pairs of Technologies (from Top 10)",
    "Find Most Similar Technology Pairs",
    "Enter a description of a technical problem to find the top 2 most similar pairs of technologies among the top 10 matches.",
    ["Data analysis pipeline", "Machine learning model deployment"],
)

iface_mixing_min = _build_interface(
    predict_least_similar_pairs,
    "Top 2 Least Similar Pairs of Technologies (from Top 10)",
    "Find Least Similar Technology Pairs",
    "Enter a description of a technical problem to find the top 2 least similar pairs of technologies among the top 10 matches.",
    ["Frontend development", "Backend database design"],
)

# Combine the four tools into a single tabbed app.
tabbed_interface = gr.TabbedInterface(
    [iface, iface_worst, iface_mixing_max, iface_mixing_min],
    ["Find Most Similar", "Find Least Similar", "Most Similar Pairs", "Least Similar Pairs"],
)

# Load data and model on startup, then serve.
load_data_and_model()
tabbed_interface.launch()
 
1
import subprocess
import sys
import os

# Install dependencies at startup (runtime workaround: the real app lives in
# main.py and its requirements are installed here before launching).
if os.path.exists("requirements.txt"):
    # Use the running interpreter's pip (sys.executable -m pip) so packages
    # land in the correct environment; check=True aborts if install fails
    # instead of silently launching with missing dependencies.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"],
        check=True,
    )

# Run FastAPI with Uvicorn on port 80; check=True surfaces a non-zero exit
# from the server process as an error here.
subprocess.run(
    [sys.executable, "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"],
    check=True,
)