heymenn committed on
Commit
dcf3971
·
verified ·
1 Parent(s): deda0a8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +218 -0
app.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import json
4
+ from sentence_transformers import SentenceTransformer, util
5
+ import torch
6
+ from duckduckgo_search import DDGS
7
+ import re
8
+
9
# --- Configuration ---
# Input data files, given as paths relative to the working directory.
CATEGORY_JSON_PATH = "categories.json"
TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"

# Sentence-transformer checkpoint (a good general-purpose model).
MODEL_NAME = "all-MiniLM-L6-v2"

# Minimum cosine similarity for a category to count as a match; adjust as needed.
SIMILARITY_THRESHOLD = 0.3

# Maximum number of web search results requested per technology.
MAX_SEARCH_RESULTS_PER_TECH = 3
15
+
16
# --- Load Data and Model (Load once at startup) ---
# Everything below runs at import time so that every request reuses the same
# category list, technology table, model and category embeddings.
print("Loading data and model...")
try:
    # Load Categories. The file is decoded as UTF-8 explicitly so the result
    # does not depend on the platform's default locale encoding.
    with open(CATEGORY_JSON_PATH, 'r', encoding='utf-8') as f:
        categories_data = json.load(f)["Category"]
    # Prepare category texts for embedding (Category Name + Keywords)
    category_names = list(categories_data.keys())
    category_texts = [f"{name}: {', '.join(keywords)}" for name, keywords in categories_data.items()]
    print(f"Loaded {len(category_names)} categories.")

    # Load Technologies
    technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
    # Clean the technology category column - handle potential NaN and ensure string type
    technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
    print(f"Loaded {len(technologies_df)} technologies.")

    # Load Sentence Transformer Model
    model = SentenceTransformer(MODEL_NAME)
    print(f"Loaded Sentence Transformer model: {MODEL_NAME}")

    # Pre-compute category embeddings once; they are compared against each
    # incoming problem description.
    print("Computing category embeddings...")
    category_embeddings = model.encode(category_texts, convert_to_tensor=True)
    print("Category embeddings computed.")

except FileNotFoundError as e:
    print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
    # The app cannot work without its data files: re-raise. A bare ``raise``
    # keeps the original traceback intact.
    raise
except Exception as e:
    print(f"ERROR loading data or model: {e}")
    raise
49
+
50
+ # --- Helper Functions ---
51
+
52
def find_best_category(problem_description):
    """
    Find the most relevant category for a problem description.

    The description is embedded with the module-level ``model`` and compared
    by cosine similarity against the pre-computed ``category_embeddings``.

    Args:
        problem_description: Free-text description of the problem.

    Returns:
        A ``(category_name, score)`` tuple for the highest-scoring category
        when its score is at least ``SIMILARITY_THRESHOLD``. ``(None, None)``
        is returned when the description is empty, when no categories are
        loaded, when no category reaches the threshold, or when embedding or
        scoring raises.
    """
    # Always return a 2-tuple: the caller unpacks the result into two names,
    # so a bare ``None`` here would raise TypeError at the call site.
    if not problem_description or not category_names:
        return None, None

    try:
        problem_embedding = model.encode(problem_description, convert_to_tensor=True)
        # Compute cosine similarities; row 0 holds the scores for the single query.
        cosine_scores = util.pytorch_cos_sim(problem_embedding, category_embeddings)[0]

        # Find the highest score and its index
        best_score, best_idx = torch.max(cosine_scores, dim=0)
        score = best_score.item()

        if score >= SIMILARITY_THRESHOLD:
            return category_names[best_idx.item()], score
        return None, None  # No category met the threshold
    except Exception as e:
        # Best-effort: report the failure and behave as "no category found".
        print(f"Error during category finding: {e}")
        return None, None
74
+
75
def find_relevant_technologies(category_name):
    """
    Select the technologies whose ``category`` cell lists the given category.

    A ``category`` cell may name several categories separated by commas
    (like "Cat1, Cat2"). Each entry is stripped of surrounding whitespace and
    compared to ``category_name`` for exact equality.

    Args:
        category_name: Category to look up; may be None or empty.

    Returns:
        A new DataFrame holding the matching rows of the module-level
        ``technologies_df`` (original index labels kept), or an empty
        ``pd.DataFrame()`` when there is no category, no data or no match.
    """
    if not category_name or technologies_df.empty:
        return pd.DataFrame()  # Nothing to look up

    def lists_category(cell):
        # Split the cell on commas and compare whole entries, so that "AI"
        # does not match a cell that only lists "AI Ethics".
        return category_name in [part.strip() for part in str(cell).split(',')]

    # Boolean mask over the column instead of collecting rows one by one:
    # selecting with a mask keeps the column dtypes of the source frame.
    mask = technologies_df['category'].map(lists_category).astype(bool)
    matches = technologies_df[mask]

    if matches.empty:
        return pd.DataFrame()  # Return empty if no matches

    # Copy so callers never hold a view onto the shared module-level frame.
    return matches.copy()
95
+
96
+
97
def search_solutions(problem_description, technologies):
    """
    Search DuckDuckGo for solutions combining the problem and technologies.

    One query is issued per unique value of the ``technology`` column, asking
    for at most ``MAX_SEARCH_RESULTS_PER_TECH`` results each.

    Args:
        problem_description: Free-text description of the problem.
        technologies: DataFrame with a ``technology`` column.

    Returns:
        A Markdown string listing the result links per technology. A plain
        message is returned instead when ``technologies`` is empty or when
        the search raises.
    """
    if technologies.empty:
        return "No relevant technologies found to search for solutions."

    results = {}
    try:
        with DDGS() as ddgs:
            for tech_name in technologies['technology'].unique():  # Use unique names
                # Simple cleaning of a leading "- Title :" prefix - might
                # need adjustment based on the actual data.
                clean_tech_name = re.sub(r'^- Title\s*:\s*', '', str(tech_name)).strip()
                if not clean_tech_name:
                    continue  # Skip if name is empty after cleaning

                query = _build_search_query(problem_description, clean_tech_name)
                print(f"Searching for: {query}")
                # Each result is a dict {'title': ..., 'href': ..., 'body': ...}.
                # An empty list records that this technology had no results.
                results[clean_tech_name] = list(
                    ddgs.text(query, max_results=MAX_SEARCH_RESULTS_PER_TECH)
                )
    except Exception as e:
        # A failure aborts the whole search and is reported to the user.
        print(f"Error during web search: {e}")
        return f"An error occurred during the search: {e}"

    return _format_search_results(results)


def _build_search_query(problem_description, tech_name):
    """Build the quoted search query for one problem/technology pair."""
    # Double quotes or newlines inside the user text would end the quoted
    # phrase early, so replace quotes and collapse whitespace before quoting.
    phrase = " ".join(str(problem_description).replace('"', ' ').split())[:100]  # Limit query length
    tech = " ".join(str(tech_name).replace('"', ' ').split())
    return f'"{phrase}" using "{tech}" solution OR tutorial OR implementation'


def _format_search_results(results):
    """Render a ``{technology: [result, ...]}`` mapping as Markdown."""
    parts = ["### Potential Solutions & Resources:\n\n"]
    if not results:
        parts.append("No search results found.")
        return "".join(parts)

    for tech, links in results.items():
        parts.append(f"**For Technology: {tech}**\n")
        if links:
            parts.extend(f"- [{link['title']}]({link['href']})\n" for link in links)
        else:
            parts.append("- *No specific results found for this technology combination.*\n")
        parts.append("\n")

    return "".join(parts)
144
+
145
+ # --- Main Processing Function ---
146
def process_problem(problem_description):
    """
    Main function called by the Gradio interface.

    Orchestrates the categorization, technology finding, and solution
    searching, and combines the three parts into one report.

    Args:
        problem_description: Text entered by the user.

    Returns:
        A single Markdown string. For an empty or whitespace-only
        description this is just a prompt asking for input.
    """
    # The interface declares exactly one Markdown output, so every path must
    # return one string (not a tuple of three).
    if not problem_description or not problem_description.strip():
        return "Please enter a problem description."

    # 1. Categorize Problem
    # Accept a bare None as well as (None, None) as "no category".
    best = find_best_category(problem_description)
    category_name, score = best if best else (None, None)
    if category_name:
        category_output = f"**Identified Category:** {category_name} (Similarity Score: {score:.2f})"
    else:
        # Processing deliberately continues so the report still explains
        # why no technologies or solutions are listed.
        category_output = "**Could not confidently identify a relevant category.**"

    # 2. Find Relevant Technologies
    relevant_technologies_df = find_relevant_technologies(category_name)  # Pass None if category not found
    if not relevant_technologies_df.empty:
        sections = ["### Relevant Technologies:\n\n"]
        for _, row in relevant_technologies_df.iterrows():
            # Descriptions may use <br> as a line separator; turn those into
            # real newlines and drop blank lines for cleaner display.
            desc_lines = str(row['description']).split('<br>')
            cleaned_desc = "\n".join(line.strip() for line in desc_lines if line.strip())
            sections.append(
                f"**Technology:** {row['technology']}\n**Description:**\n{cleaned_desc}\n\n---\n"
            )
        tech_output = "".join(sections)
    elif category_name:
        tech_output = f"No specific technologies found listed under the '{category_name}' category in the provided data."
    else:
        tech_output = "No relevant technologies could be identified as no category was matched."

    # 3. Search for Solutions
    solution_output = search_solutions(problem_description, relevant_technologies_df)

    # 4. Combine Outputs for Gradio, using Markdown for better formatting
    return f"## Analysis Results\n\n{category_output}\n\n{tech_output}\n\n{solution_output}"
190
+
191
+
192
# --- Create Gradio Interface ---
print("Setting up Gradio interface...")
# One text box in, one Markdown report out; process_problem returns the
# whole report as a single Markdown string.
iface = gr.Interface(
    title="Technical Problem Analyzer",
    description=(
        "Enter a technical problem. The application will attempt to categorize it, "
        "find relevant technologies from a predefined list, "
        "and search for potential online solutions using those technologies."
    ),
    fn=process_problem,
    inputs=gr.Textbox(
        lines=5,
        label="Enter Technical Problem Description",
        placeholder="Describe your technical challenge or requirement here...",
    ),
    outputs=gr.Markdown(label="Analysis and Potential Solutions"),
    examples=[
        ["How can I establish reliable communication between low-orbit satellites for continuous global monitoring?"],
        ["Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning."],
        ["Develop a secure authentication method for a distributed IoT network without a central server."],
    ],
    allow_flagging="never",  # Flagging is disabled
)

# --- Launch the App ---
# Only start the server when this file is run as a script.
if __name__ == "__main__":
    print("Launching Gradio app...")
    iface.launch()