heymenn committed on
Commit
d6ed968
·
verified ·
1 Parent(s): fd5072d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +156 -92
app.py CHANGED
@@ -3,15 +3,18 @@ import pandas as pd
3
  import json
4
  from sentence_transformers import SentenceTransformer, util
5
  import torch
6
- from duckduckgo_search import DDGS
7
  import re
 
8
 
9
  # --- Configuration ---
10
  CATEGORY_JSON_PATH = "categories.json"
11
  TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"
12
  MODEL_NAME = 'all-MiniLM-L6-v2' # A good general-purpose sentence transformer
13
- SIMILARITY_THRESHOLD = 0.3 # Adjust as needed
14
- MAX_SEARCH_RESULTS_PER_TECH = 3
 
 
15
 
16
  # --- Load Data and Model (Load once at startup) ---
17
  print("Loading data and model...")
@@ -19,15 +22,15 @@ try:
19
  # Load Categories
20
  with open(CATEGORY_JSON_PATH, 'r') as f:
21
  categories_data = json.load(f)["Category"]
22
- # Prepare category texts for embedding (Category Name + Keywords)
23
  category_names = list(categories_data.keys())
24
  category_texts = [f"{name}: {', '.join(keywords)}" for name, keywords in categories_data.items()]
25
  print(f"Loaded {len(category_names)} categories.")
26
 
27
  # Load Technologies
28
  technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
29
- # Clean the technology category column - handle potential NaN and ensure string type
30
  technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
 
 
31
  print(f"Loaded {len(technologies_df)} technologies.")
32
 
33
  # Load Sentence Transformer Model
@@ -39,9 +42,14 @@ try:
39
  category_embeddings = model.encode(category_texts, convert_to_tensor=True)
40
  print("Category embeddings computed.")
41
 
 
 
 
 
 
 
42
  except FileNotFoundError as e:
43
  print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
44
- # Optionally raise the error or exit if critical files are missing
45
  raise e
46
  except Exception as e:
47
  print(f"ERROR loading data or model: {e}")
@@ -54,165 +62,221 @@ def find_best_category(problem_description):
54
  Finds the most relevant category for the problem description using semantic similarity.
55
  """
56
  if not problem_description or not category_names:
57
- return None
58
 
59
  try:
60
  problem_embedding = model.encode(problem_description, convert_to_tensor=True)
61
- # Compute cosine similarities
62
  cosine_scores = util.pytorch_cos_sim(problem_embedding, category_embeddings)[0]
63
-
64
- # Find the highest score and its index
65
  best_score, best_idx = torch.max(cosine_scores, dim=0)
66
 
67
- if best_score.item() >= SIMILARITY_THRESHOLD:
68
  return category_names[best_idx.item()], best_score.item()
69
  else:
70
- return None, None # No category met the threshold
71
  except Exception as e:
72
  print(f"Error during category finding: {e}")
73
- return None, None
74
 
75
- def find_relevant_technologies(category_name):
76
  """
77
- Filters the technologies DataFrame based on the identified category.
78
- Handles categories listed like "Cat1, Cat2".
79
  """
80
- if not category_name or technologies_df.empty:
81
- return pd.DataFrame() # Return empty DataFrame if no category or data
 
 
 
 
82
 
83
- relevant_tech = []
84
- # Iterate through the DataFrame safely
85
- for index, row in technologies_df.iterrows():
86
- # Split the 'category' string by comma and strip whitespace
87
- tech_categories = [cat.strip() for cat in str(row['category']).split(',')]
88
- if category_name in tech_categories:
89
- relevant_tech.append(row)
 
 
 
 
90
 
91
- if not relevant_tech:
92
- return pd.DataFrame() # Return empty if no matches
93
 
94
- return pd.DataFrame(relevant_tech)
 
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
- def search_solutions(problem_description, technologies):
 
98
  """
99
- Searches DuckDuckGo for solutions combining the problem and technologies.
100
  """
101
  results = {}
102
- if technologies.empty:
103
- return "No relevant technologies found to search for solutions."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- try:
106
- with DDGS() as ddgs:
107
- for tech_name in technologies['technology'].unique(): # Use unique names
108
- # Clean up tech_name if it has extra info (like title prefixes)
109
- # Simple cleaning - might need adjustment based on actual data
110
- clean_tech_name = re.sub(r'^- Title\s*:\s*', '', str(tech_name)).strip()
111
- if not clean_tech_name: continue # Skip if name is empty after cleaning
112
-
113
- query = f'"{problem_description[:100]}" using "{clean_tech_name}" solution OR tutorial OR implementation' # Limit query length
114
- print(f"Searching for: {query}")
115
- search_results = []
116
- for i, result in enumerate(ddgs.text(query, max_results=MAX_SEARCH_RESULTS_PER_TECH)):
117
- search_results.append(result) # result is a dict {'title': ..., 'href': ..., 'body': ...}
118
-
119
- if search_results:
120
- results[clean_tech_name] = search_results
121
- else:
122
- results[clean_tech_name] = [] # Indicate no results found for this tech
123
-
124
- except Exception as e:
125
- print(f"Error during web search: {e}")
126
- return f"An error occurred during the search: {e}"
127
 
128
  # Format results for display
129
- output = "### Potential Solutions & Resources:\n\n"
130
  if not results:
131
- output += "No search results found."
132
  return output
133
 
134
- for tech, links in results.items():
135
  output += f"**For Technology: {tech}**\n"
136
- if links:
137
- for link in links:
138
- output += f"- [{link['title']}]({link['href']})\n" #{link['body'][:100]}...\n" # Optionally add body snippet
139
- else:
140
- output += "- *No specific results found for this technology combination.*\n"
 
 
 
 
 
 
 
141
  output += "\n"
142
 
143
  return output
144
 
 
145
  # --- Main Processing Function ---
146
  def process_problem(problem_description):
147
  """
148
  Main function called by Gradio interface.
149
- Orchestrates the categorization, technology finding, and solution searching.
150
  """
151
  if not problem_description:
152
- return "Please enter a problem description.", "", ""
153
 
154
  # 1. Categorize Problem
155
  category_name, score = find_best_category(problem_description)
156
  if category_name:
157
  category_output = f"**Identified Category:** {category_name} (Similarity Score: {score:.2f})"
158
  else:
159
- category_output = "**Could not confidently identify a relevant category.**"
160
- # Return early if no category is found? Or proceed with empty tech? Let's proceed for now.
161
- # return category_output, "No category identified, cannot find technologies.", "No category identified, cannot search solutions."
 
 
162
 
163
- # 2. Find Relevant Technologies
164
- relevant_technologies_df = find_relevant_technologies(category_name) # Pass None if category not found
165
  if not relevant_technologies_df.empty:
166
- tech_output = "### Relevant Technologies:\n\n"
167
  for _, row in relevant_technologies_df.iterrows():
168
  # Clean up the description for better display
169
- # Assuming description format like "- Title : ... \n - Purpose : ..."
170
- desc_lines = str(row['description']).split('<br>') # Split by <br> if present
171
  cleaned_desc = "\n".join([line.strip() for line in desc_lines if line.strip()])
172
- tech_output += f"**Technology:** {row['technology']}\n**Description:**\n{cleaned_desc}\n\n---\n"
 
 
 
173
  elif category_name:
174
  tech_output = f"No specific technologies found listed under the '{category_name}' category in the provided data."
175
  else:
176
  tech_output = "No relevant technologies could be identified as no category was matched."
177
 
178
 
179
- # 3. Search for Solutions
180
- solution_output = search_solutions(problem_description, relevant_technologies_df)
181
 
182
  # 4. Combine Outputs for Gradio
183
- # Using Markdown for better formatting
184
  final_output = f"## Analysis Results\n\n{category_output}\n\n{tech_output}\n\n{solution_output}"
185
 
186
- # Gradio currently works best returning separate components if you define multiple outputs.
187
- # Let's return a single formatted Markdown string for simplicity here.
188
- # If you define 3 Markdown outputs in gr.Interface, you'd return: category_output, tech_output, solution_output
189
  return final_output
190
 
191
-
192
  # --- Create Gradio Interface ---
193
  print("Setting up Gradio interface...")
194
  iface = gr.Interface(
195
  fn=process_problem,
196
  inputs=gr.Textbox(lines=5, label="Enter Technical Problem Description", placeholder="Describe your technical challenge or requirement here..."),
197
- outputs=gr.Markdown(label="Analysis and Potential Solutions"), # Single Markdown output
198
- # If using multiple outputs:
199
- # outputs=[
200
- # gr.Markdown(label="Identified Category"),
201
- # gr.Markdown(label="Relevant Technologies"),
202
- # gr.Markdown(label="Potential Solutions (Search Results)")
203
- # ],
204
- title="Technical Problem Analyzer",
205
- description="Enter a technical problem. The application will attempt to categorize it, find relevant technologies from a predefined list, and search for potential online solutions using those technologies.",
206
  examples=[
207
  ["How can I establish reliable communication between low-orbit satellites for continuous global monitoring?"],
208
  ["Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning."],
209
  ["Develop a secure authentication method for a distributed IoT network without a central server."]
210
  ],
211
- allow_flagging='never', # Optional: disable flagging
212
- # theme=gr.themes.Soft() # Optional: Apply a theme
213
  )
214
 
215
  # --- Launch the App ---
216
  if __name__ == "__main__":
217
  print("Launching Gradio app...")
218
- iface.launch() # Share=True to create a public link (requires login on Hugging Face Spaces)
 
3
  import json
4
  from sentence_transformers import SentenceTransformer, util
5
  import torch
6
+ import requests # Use requests for API calls
7
  import re
8
+ import urllib.parse # To encode URL parameters
9
 
10
  # --- Configuration ---
11
  CATEGORY_JSON_PATH = "categories.json"
12
  TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"
13
  MODEL_NAME = 'all-MiniLM-L6-v2' # A good general-purpose sentence transformer
14
+ CATEGORY_SIMILARITY_THRESHOLD = 0.3 # Threshold for matching category
15
+ MAX_TECHNOLOGIES_TO_SHOW = 8 # Enhancement 1: Limit displayed technologies
16
+ MAX_SEARCH_REFERENCES_PER_TECH = 3 # Max references from the search API
17
+ SEARCH_API_URL = "https://ychkhan-ptt-endpoints.hf.space/search" # Enhancement 3: New API endpoint
18
 
19
  # --- Load Data and Model (Load once at startup) ---
20
  print("Loading data and model...")
 
22
  # Load Categories
23
  with open(CATEGORY_JSON_PATH, 'r') as f:
24
  categories_data = json.load(f)["Category"]
 
25
  category_names = list(categories_data.keys())
26
  category_texts = [f"{name}: {', '.join(keywords)}" for name, keywords in categories_data.items()]
27
  print(f"Loaded {len(category_names)} categories.")
28
 
29
  # Load Technologies
30
  technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
 
31
  technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
32
+ # Pre-process description for embedding (use description column directly)
33
+ technologies_df['description_clean'] = technologies_df['description'].fillna('').astype(str)
34
  print(f"Loaded {len(technologies_df)} technologies.")
35
 
36
  # Load Sentence Transformer Model
 
42
  category_embeddings = model.encode(category_texts, convert_to_tensor=True)
43
  print("Category embeddings computed.")
44
 
45
+ # Pre-compute technology description embeddings (Optional but speeds up repeated calculations)
46
+ # print("Computing technology description embeddings...")
47
+ # technology_desc_embeddings = model.encode(technologies_df['description_clean'].tolist(), convert_to_tensor=True, show_progress_bar=True)
48
+ # print("Technology description embeddings computed.")
49
+ # NOTE: If pre-computing tech embeddings, adjust find_relevant_technologies to use them by index
50
+
51
  except FileNotFoundError as e:
52
  print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
 
53
  raise e
54
  except Exception as e:
55
  print(f"ERROR loading data or model: {e}")
 
62
  Finds the most relevant category for the problem description using semantic similarity.
63
  """
64
  if not problem_description or not category_names:
65
+ return None, 0.0
66
 
67
  try:
68
  problem_embedding = model.encode(problem_description, convert_to_tensor=True)
 
69
  cosine_scores = util.pytorch_cos_sim(problem_embedding, category_embeddings)[0]
 
 
70
  best_score, best_idx = torch.max(cosine_scores, dim=0)
71
 
72
+ if best_score.item() >= CATEGORY_SIMILARITY_THRESHOLD:
73
  return category_names[best_idx.item()], best_score.item()
74
  else:
75
+ return None, best_score.item() # Return score even if below threshold
76
  except Exception as e:
77
  print(f"Error during category finding: {e}")
78
+ return None, 0.0
79
 
80
def find_relevant_technologies(category_name, problem_description):
    """
    Filter technologies by category, rank them by semantic similarity to the
    problem description, and return the top matches.

    Args:
        category_name: Category identified for the problem (may be None).
        problem_description: Free-text problem statement used for scoring.

    Returns:
        pd.DataFrame: Up to MAX_TECHNOLOGIES_TO_SHOW rows with an added
        'similarity_score' column, sorted descending by score. Empty
        DataFrame when category/data/description is missing or on error.
    """
    if not category_name or technologies_df.empty or not problem_description:
        return pd.DataFrame()  # Nothing to filter or score against

    try:
        # Select rows whose comma-separated 'category' field contains the category.
        matched_rows = [
            row for _, row in technologies_df.iterrows()
            if category_name in (cat.strip() for cat in str(row['category']).split(','))
        ]
        if not matched_rows:
            return pd.DataFrame()

        problem_embedding = model.encode(problem_description, convert_to_tensor=True)

        # Batch-encode all candidate descriptions in ONE model.encode call
        # instead of one call per row inside the loop — identical scores,
        # far fewer forward passes through the transformer.
        descriptions = [row['description_clean'] for row in matched_rows]
        scores = [0.0] * len(matched_rows)  # rows with empty descriptions keep 0.0
        to_encode = [(i, d) for i, d in enumerate(descriptions) if d]
        if to_encode:
            tech_embeddings = model.encode([d for _, d in to_encode], convert_to_tensor=True)
            sims = util.pytorch_cos_sim(problem_embedding, tech_embeddings)[0]
            for (i, _), sim in zip(to_encode, sims):
                scores[i] = sim.item()

        relevant_df = pd.DataFrame(matched_rows)
        relevant_df['similarity_score'] = scores
        # kind='stable' preserves original order among equal scores, matching
        # the behavior of Python's stable list.sort on (row, score) tuples.
        relevant_df = relevant_df.sort_values('similarity_score', ascending=False, kind='stable')

        # Limit the number of technologies shown to the caller.
        return relevant_df.head(MAX_TECHNOLOGIES_TO_SHOW)

    except Exception as e:
        print(f"Error during technology finding/scoring: {e}")
        return pd.DataFrame()  # Fail soft: caller treats empty as "no matches"
125
 
126
+
127
def search_solutions_api(problem_description, technologies):
    """
    Search for solution references via the external search API (one POST per
    technology) and format everything as a Markdown section.

    Args:
        problem_description: Free-text problem statement; truncated to 100
            characters when building the query.
        technologies: DataFrame with a 'technology' column (one API call per row).

    Returns:
        str: Markdown-formatted results, or an explanatory message when there
        is nothing to search for.
    """
    results = {}
    if technologies.empty or not problem_description:
        return "No relevant technologies found or problem description missing, cannot search for solutions."

    headers = {'accept': 'application/json'}

    for index, tech_row in technologies.iterrows():
        tech_name = tech_row['technology']
        # Strip a possible "- Title :" prefix left over from the source data.
        clean_tech_name = re.sub(r'^- Title\s*:\s*', '', str(tech_name)).strip()
        if not clean_tech_name:
            continue  # Skip rows with no usable name after cleaning

        query = f'"{problem_description[:100]}" using "{clean_tech_name}" solution OR tutorial OR implementation'

        params = {
            'query': query,
            'max_references': MAX_SEARCH_REFERENCES_PER_TECH
        }
        full_url = f"{SEARCH_API_URL}?{urllib.parse.urlencode(params)}"
        print(f"Calling API: POST {full_url}")

        try:
            # The endpoint expects POST with query parameters in the URL
            # (matching its curl example), even though that is GET-like.
            response = requests.post(full_url, headers=headers, timeout=30)
            response.raise_for_status()  # 4xx/5xx -> RequestException branch below
            api_response = response.json()

            # Accept either a bare list of results or {'results': [...]}.
            if isinstance(api_response, list):
                search_results = api_response
            elif isinstance(api_response, dict) and isinstance(api_response.get('results'), list):
                search_results = api_response['results']
            else:
                search_results = []
                print(f"Warning: Unexpected API response format for tech '{clean_tech_name}'. Response: {api_response}")

            # Keep only dict entries and coerce fields to str here so the
            # formatting pass below cannot crash on a null/non-string 'link'
            # (str.startswith would raise on None).
            results[clean_tech_name] = [
                {'title': str(r.get('title') or 'N/A'), 'link': str(r.get('link') or '#')}
                for r in search_results if isinstance(r, dict)
            ]

        except requests.exceptions.RequestException as e:
            print(f"Error calling search API for tech '{clean_tech_name}': {e}")
            results[clean_tech_name] = f"API Error: {e}"  # string marks a failed search
        except json.JSONDecodeError:
            print(f"Error decoding JSON response for tech '{clean_tech_name}'. Status: {response.status_code}, Response text: {response.text[:200]}")
            results[clean_tech_name] = "API Error: Invalid JSON response."
        except Exception as e:
            print(f"Unexpected error during API call for tech '{clean_tech_name}': {e}")
            results[clean_tech_name] = f"Unexpected Error: {e}"

    # Format results for display
    output = "### Potential Solutions & Resources (via API):\n\n"
    if not results:
        output += "No search results could be retrieved from the API."
        return output

    for tech, search_data in results.items():
        output += f"**For Technology: {tech}**\n"
        if isinstance(search_data, list):
            if search_data:
                for link_info in search_data:
                    # Only emit real http(s) links so the Markdown stays valid.
                    href = link_info.get('link', '#')
                    if not href.startswith(('http://', 'https://')):
                        href = '#'
                    output += f"- [{link_info.get('title', 'N/A')}]({href})\n"
            else:
                output += "- *No specific results found by the API for this technology combination.*\n"
        else:  # An error-message string was stored for this technology
            output += f"- *Search failed: {search_data}*\n"
        output += "\n"

    return output
217
 
218
+
219
  # --- Main Processing Function ---
220
def process_problem(problem_description):
    """
    Gradio entry point: categorize the problem, rank relevant technologies,
    search for solutions via the API, and return one Markdown report.
    """
    if not problem_description:
        return "Please enter a problem description."

    # 1. Categorize the problem text
    category_name, score = find_best_category(problem_description)
    if category_name:
        category_md = f"**Identified Category:** {category_name} (Similarity Score: {score:.2f})"
    else:
        category_md = f"**Could not confidently identify a relevant category.** (Highest score: {score:.2f})"

    # 2. Rank technologies against the problem (None category yields empty df)
    tech_df = find_relevant_technologies(category_name, problem_description)
    if tech_df.empty:
        if category_name:
            tech_md = f"No specific technologies found listed under the '{category_name}' category in the provided data."
        else:
            tech_md = "No relevant technologies could be identified as no category was matched."
    else:
        # Accumulate fragments and join once rather than repeated +=
        parts = [f"### Relevant Technologies (Top {len(tech_df)} based on relevance to problem):\n\n"]
        for _, row in tech_df.iterrows():
            # Descriptions may embed <br> separators; re-flow them as lines
            fragments = str(row['description']).split('<br>')
            description = "\n".join([frag.strip() for frag in fragments if frag.strip()])
            parts.append(f"**Technology:** {row['technology']}\n")
            parts.append(f"**Relevance Score:** {row['similarity_score']:.2f}\n")
            parts.append(f"**Description:**\n{description}\n\n---\n")
        tech_md = "".join(parts)

    # 3. Query the search API for solution references
    solution_md = search_solutions_api(problem_description, tech_df)

    # 4. Single combined Markdown document for the Gradio output
    return f"## Analysis Results\n\n{category_md}\n\n{tech_md}\n\n{solution_md}"
262
 
 
263
# --- Create Gradio Interface ---
# Single-input / single-output UI: one textbox in, one Markdown report out.
print("Setting up Gradio interface...")
iface = gr.Interface(
    fn=process_problem,  # returns one combined Markdown string
    inputs=gr.Textbox(lines=5, label="Enter Technical Problem Description", placeholder="Describe your technical challenge or requirement here..."),
    outputs=gr.Markdown(label="Analysis and Potential Solutions"),
    title="Technical Problem Analyzer v2",
    description="Enter a technical problem. The application will attempt to categorize it, find relevant technologies (showing top matches with relevance scores), and search for potential online solutions using a dedicated API.",
    # Example prompts shown under the input box; each inner list is one input set.
    examples=[
        ["How can I establish reliable communication between low-orbit satellites for continuous global monitoring?"],
        ["Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning."],
        ["Develop a secure authentication method for a distributed IoT network without a central server."]
    ],
    allow_flagging='never',  # hide the flagging button entirely
)

# --- Launch the App ---
# Guarded so importing this module (e.g. in tests) does not start a server.
if __name__ == "__main__":
    print("Launching Gradio app...")
    iface.launch()