USER-GNEXUSES committed on
Commit e7597b4 · verified · 1 Parent(s): 22a8e59

Update app.py

Files changed (1)
  1. app.py +826 -61
app.py CHANGED
@@ -1,69 +1,834 @@
1
- from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
2
- import datetime
3
- import requests
4
- import pytz
5
- import yaml
6
- from tools.final_answer import FinalAnswerTool
7
 
8
- from Gradio_UI import GradioUI
9
 
10
- # Below is an example of a tool that does nothing. Amaze us with your creativity !
11
- @tool
12
- def my_custom_tool(arg1:str, arg2:int)-> str: #it's import to specify the return type
13
- #Keep this format for the description / args / args description but feel free to modify the tool
14
- """A tool that does nothing yet
15
- Args:
16
- arg1: the first argument
17
- arg2: the second argument
18
  """
19
- return "What magic will you build ?"
20
-
21
- @tool
22
- def get_current_time_in_timezone(timezone: str) -> str:
23
- """A tool that fetches the current local time in a specified timezone.
24
- Args:
25
- timezone: A string representing a valid timezone (e.g., 'America/New_York').
26
  """
27
- try:
28
- # Create timezone object
29
- tz = pytz.timezone(timezone)
30
- # Get current time in that timezone
31
- local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
32
- return f"The current local time in {timezone} is: {local_time}"
33
- except Exception as e:
34
- return f"Error fetching time for timezone '{timezone}': {str(e)}"
35
-
36
-
37
- final_answer = FinalAnswerTool()
38
-
39
- # If the agent does not answer, the model is overloaded, please use another model or the following Hugging Face Endpoint that also contains qwen2.5 coder:
40
- # model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud'
41
-
42
- model = HfApiModel(
43
- max_tokens=2096,
44
- temperature=0.5,
45
- model_id='Qwen/Qwen2.5-Coder-32B-Instruct',# it is possible that this model may be overloaded
46
- custom_role_conversions=None,
47
- )
48
-
49
 
50
- # Import tool from Hub
51
- image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
52
 
53
- with open("prompts.yaml", 'r') as stream:
54
- prompt_templates = yaml.safe_load(stream)
55
-
56
- agent = CodeAgent(
57
- model=model,
58
- tools=[final_answer], ## add your tools here (don't remove final answer)
59
- max_steps=6,
60
- verbosity_level=1,
61
- grammar=None,
62
- planning_interval=None,
63
- name=None,
64
- description=None,
65
- prompt_templates=prompt_templates
66
- )
67
 
68
 
69
- GradioUI(agent).launch()
1
+ # process_discovery_engine.py
2
 
3
+ import numpy as np
4
+ import pandas as pd
5
+ from typing import Dict, List, Tuple, Optional
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+ import spacy
9
+ import json
10
+ import re
11
+ import networkx as nx
12
+ from sklearn.cluster import DBSCAN
13
 
14
+ class ProcessDiscoveryEngine:
15
  """
16
+ Discovers and analyzes business processes from various data sources
17
+ including logs, documents, and recorded user activities.
18
  """
19
+
20
+ def __init__(self, config: Dict):
21
+ """
22
+ Initialize the process discovery engine.
23
+
24
+ Args:
25
+ config: Configuration dictionary with parameters
26
+ """
27
+ self.min_frequency = config.get('min_frequency', 0.05)
28
+ self.time_threshold = config.get('time_threshold', 60) # seconds
29
+ self.similarity_threshold = config.get('similarity_threshold', 0.75)
30
+ self.process_graph = nx.DiGraph()
31
+
32
+ def ingest_log_data(self, log_data: pd.DataFrame) -> bool:
33
+ """
34
+ Ingest process log data from system logs.
35
+
36
+ Args:
37
+ log_data: DataFrame containing log entries with timestamp, user, action columns
38
+
39
+ Returns:
40
+ bool: Success status
41
+ """
42
+ if 'timestamp' not in log_data.columns or 'action' not in log_data.columns:
43
+ return False
44
+
45
+ # Sort by timestamp
46
+ sorted_logs = log_data.sort_values('timestamp')
47
+
48
+ # Group by case_id if available
49
+ if 'case_id' in sorted_logs.columns:
50
+ case_groups = sorted_logs.groupby('case_id')
51
+ for case_id, case_data in case_groups:
52
+ self._process_sequence(case_data['action'].tolist(),
53
+ source=f"log:{case_id}")
54
+ else:
55
+ # Try to identify sessions based on time gaps
56
+ self._segment_and_process_logs(sorted_logs)
57
+
58
+ return True
59
+
60
+ def ingest_screen_recordings(self, recording_analysis: List[Dict]) -> bool:
61
+ """
62
+ Ingest analyzed screen recording data.
63
+
64
+ Args:
65
+ recording_analysis: List of dictionaries containing screen activities
66
+
67
+ Returns:
68
+ bool: Success status
69
+ """
70
+ for session in recording_analysis:
71
+ if 'actions' in session and isinstance(session['actions'], list):
72
+ action_sequence = [a['activity'] for a in session['actions']
73
+ if 'activity' in a]
74
+ self._process_sequence(action_sequence,
75
+ source=f"recording:{session.get('id', 'unknown')}")
76
+
77
+ return True
78
+
79
+ def _segment_and_process_logs(self, logs: pd.DataFrame) -> None:
80
+ """
81
+ Segment logs into probable process instances based on time gaps.
82
+
83
+ Args:
84
+ logs: DataFrame of logs sorted by timestamp
85
+ """
86
+ logs['timestamp'] = pd.to_datetime(logs['timestamp'])
87
+ logs['time_diff'] = logs['timestamp'].diff().dt.total_seconds()
88
+
89
+ # Mark new sequences where time difference exceeds threshold
90
+ new_sequence = logs['time_diff'] > self.time_threshold
91
+ logs['sequence_id'] = new_sequence.cumsum()
92
+
93
+ # Process each sequence
94
+ for seq_id, sequence in logs.groupby('sequence_id'):
95
+ self._process_sequence(sequence['action'].tolist(),
96
+ source=f"timegap:{seq_id}")
97
+
98
+ def _process_sequence(self, actions: List[str], source: str) -> None:
99
+ """
100
+ Process a sequence of actions into the process graph.
101
+
102
+ Args:
103
+ actions: List of action names in sequence
104
+ source: Data source identifier
105
+ """
106
+ for i in range(len(actions) - 1):
107
+ current = actions[i]
108
+ next_action = actions[i+1]
109
+
110
+ # Add nodes if they don't exist
111
+ if current not in self.process_graph:
112
+ self.process_graph.add_node(current, count=0, sources=set())
113
+ if next_action not in self.process_graph:
114
+ self.process_graph.add_node(next_action, count=0, sources=set())
115
+
116
+ # Update node data
117
+ self.process_graph.nodes[current]['count'] += 1
118
+ self.process_graph.nodes[current]['sources'].add(source)
119
+
120
+ # Add or update edge
121
+ if self.process_graph.has_edge(current, next_action):
122
+ self.process_graph[current][next_action]['weight'] += 1
123
+ self.process_graph[current][next_action]['sources'].add(source)
124
+ else:
125
+ self.process_graph.add_edge(current, next_action,
126
+ weight=1, sources={source})
127
+
128
+ def discover_main_process_paths(self) -> List[Dict]:
129
+ """
130
+ Discover the main process paths from the constructed graph.
131
+
132
+ Returns:
133
+ List of dictionaries describing main process paths
134
+ """
135
+ # Filter edges by frequency
136
+ total_transitions = sum(data['weight'] for _, _, data in self.process_graph.edges(data=True))
137
+
138
+ if total_transitions == 0:
139
+ return []
140
+
141
+ min_edge_weight = total_transitions * self.min_frequency
142
+ significant_edges = [(u, v) for u, v, d in self.process_graph.edges(data=True)
143
+ if d['weight'] > min_edge_weight]
144
+
145
+ # Create subgraph with only significant edges
146
+ significant_graph = self.process_graph.edge_subgraph(significant_edges).copy()
147
+
148
+ # Find all simple paths from potential start nodes to end nodes
149
+ start_nodes = [n for n in significant_graph.nodes()
150
+ if significant_graph.in_degree(n) == 0 or
151
+ significant_graph.in_degree(n) < significant_graph.out_degree(n)]
152
+
153
+ end_nodes = [n for n in significant_graph.nodes()
154
+ if significant_graph.out_degree(n) == 0 or
155
+ significant_graph.out_degree(n) < significant_graph.in_degree(n)]
156
+
157
+ # If no clear start/end, use nodes with highest centrality
158
+ if not start_nodes:
159
+ centrality = nx.degree_centrality(significant_graph)
160
+ start_nodes = [max(centrality, key=centrality.get)]
161
+
162
+ if not end_nodes:
163
+ centrality = nx.degree_centrality(significant_graph)
164
+ end_nodes = [max(centrality, key=centrality.get)]
165
+
166
+ # Find all paths between start and end nodes
167
+ all_paths = []
168
+ for start in start_nodes:
169
+ for end in end_nodes:
170
+ try:
171
+ paths = list(nx.all_simple_paths(significant_graph, start, end))
172
+ all_paths.extend(paths)
173
+ except nx.NetworkXNoPath:
174
+ continue
175
+
176
+ # Calculate path frequency and return top paths
177
+ path_data = []
178
+ for path in all_paths:
179
+ # Calculate path strength as minimum edge weight along path
180
+ edge_weights = [significant_graph[path[i]][path[i+1]]['weight']
181
+ for i in range(len(path)-1)]
182
+ path_strength = min(edge_weights) if edge_weights else 0
183
+
184
+ path_data.append({
185
+ 'path': path,
186
+ 'strength': path_strength,
187
+ 'length': len(path),
188
+ 'avg_edge_weight': sum(edge_weights) / len(edge_weights) if edge_weights else 0
189
+ })
190
+
191
+ # Sort by path strength descending
192
+ path_data.sort(key=lambda x: x['strength'], reverse=True)
193
+
194
+ return path_data
195
+
196
+ def identify_process_variants(self) -> List[Dict]:
197
+ """
198
+ Identify variants of the same basic process.
199
+
200
+ Returns:
201
+ List of process variant clusters
202
+ """
203
+ if len(self.process_graph) < 2:
204
+ return []
205
+
206
+ # Extract features for clustering
207
+ paths = self.discover_main_process_paths()
208
+ if not paths:
209
+ return []
210
+
211
+ # Create feature vectors from paths
212
+ all_activities = sorted(list(self.process_graph.nodes()))
213
+ activity_indices = {act: i for i, act in enumerate(all_activities)}
214
+
215
+ # Create feature vectors (activity presence and position)
216
+ feature_vectors = []
217
+ for path_data in paths:
218
+ path = path_data['path']
219
+ vector = np.zeros(len(all_activities) * 2)
220
+
221
+ # Mark presence and relative position of activities
222
+ for pos, activity in enumerate(path):
223
+ idx = activity_indices[activity]
224
+ vector[idx] = 1 # presence
225
+ vector[idx + len(all_activities)] = pos / len(path) # relative position
226
+
227
+ feature_vectors.append(vector)
228
+
229
+ # Cluster paths using DBSCAN
230
+ if len(feature_vectors) < 2:
231
+ return [{'variant_id': 0, 'paths': paths}]
232
+
233
+ clustering = DBSCAN(eps=0.3, min_samples=1).fit(feature_vectors)
234
+ labels = clustering.labels_
235
+
236
+ # Group paths by cluster
237
+ variants = {}
238
+ for i, label in enumerate(labels):
239
+ label_str = str(label)
240
+ if label_str not in variants:
241
+ variants[label_str] = []
242
+ variants[label_str].append(paths[i])
243
+
244
+ # Format result
245
+ result = [
246
+ {'variant_id': variant_id, 'paths': variant_paths}
247
+ for variant_id, variant_paths in variants.items()
248
+ ]
249
+
250
+ return result
251
+
252
+ def get_process_stats(self) -> Dict:
253
+ """
254
+ Get statistics about the discovered process.
255
+
256
+ Returns:
257
+ Dictionary with process statistics
258
+ """
259
+ if not self.process_graph:
260
+ return {"error": "No process data available"}
261
+
262
+ stats = {
263
+ "num_activities": len(self.process_graph.nodes()),
264
+ "num_transitions": len(self.process_graph.edges()),
265
+ "most_frequent_activities": [],
266
+ "most_frequent_transitions": [],
267
+ "process_complexity": 0,
268
+ "data_sources": set()
269
+ }
270
+
271
+ # Most frequent activities
272
+ activities = [(node, data['count'])
273
+ for node, data in self.process_graph.nodes(data=True)]
274
+ activities.sort(key=lambda x: x[1], reverse=True)
275
+ stats["most_frequent_activities"] = activities[:10]
276
+
277
+ # Most frequent transitions
278
+ transitions = [(u, v, data['weight'])
279
+ for u, v, data in self.process_graph.edges(data=True)]
280
+ transitions.sort(key=lambda x: x[2], reverse=True)
281
+ stats["most_frequent_transitions"] = transitions[:10]
282
+
283
+ # Process complexity (using Control-Flow Complexity metric)
284
+ stats["process_complexity"] = sum(self.process_graph.out_degree(n) for n in self.process_graph.nodes())
285
+
286
+ # Data sources
287
+ for _, data in self.process_graph.nodes(data=True):
288
+ if 'sources' in data:
289
+ stats["data_sources"].update(data['sources'])
290
+
291
+ stats["data_sources"] = list(stats["data_sources"])
292
+
293
+ return stats
294
 
295
+ def export_process_model(self, format_type: str = 'bpmn') -> Dict:
296
+ """
297
+ Export the discovered process in the specified format.
298
+
299
+ Args:
300
+ format_type: Output format ('bpmn', 'petri_net', or 'json')
301
+
302
+ Returns:
303
+ Dictionary with export data and metadata
304
+ """
305
+ if format_type == 'json':
306
+ nodes = [{"id": n, "count": data.get('count', 0)}
307
+ for n, data in self.process_graph.nodes(data=True)]
308
+
309
+ edges = [{"source": u, "target": v, "weight": data.get('weight', 0)}
310
+ for u, v, data in self.process_graph.edges(data=True)]
311
+
312
+ return {
313
+ "format": "json",
314
+ "process_model": {
315
+ "nodes": nodes,
316
+ "edges": edges
317
+ }
318
+ }
319
+
320
+ elif format_type == 'bpmn':
321
+ # Basic BPMN conversion (simplified)
322
+ # In a real implementation, this would generate actual BPMN XML
323
+ return {
324
+ "format": "bpmn",
325
+ "process_model": {
326
+ "process_id": "discovered_process",
327
+ "activities": list(self.process_graph.nodes()),
328
+ "flows": [(u, v) for u, v in self.process_graph.edges()],
329
+ "gateways": self._identify_potential_gateways()
330
+ }
331
+ }
332
+
333
+ elif format_type == 'petri_net':
334
+ # Basic Petri net conversion (simplified)
335
+ return {
336
+ "format": "petri_net",
337
+ "process_model": {
338
+ "places": self._generate_petri_net_places(),
339
+ "transitions": list(self.process_graph.nodes()),
340
+ "arcs": self._generate_petri_net_arcs()
341
+ }
342
+ }
343
+
344
+ else:
345
+ return {"error": f"Unsupported export format: {format_type}"}
346
+
347
+ def _identify_potential_gateways(self) -> List[Dict]:
348
+ """
349
+ Identify potential gateways in the process based on branching.
350
+
351
+ Returns:
352
+ List of potential gateway nodes
353
+ """
354
+ gateways = []
355
+
356
+ for node in self.process_graph.nodes():
357
+ in_degree = self.process_graph.in_degree(node)
358
+ out_degree = self.process_graph.out_degree(node)
359
+
360
+ # Potential XOR-split (one input, multiple outputs)
361
+ if in_degree == 1 and out_degree > 1:
362
+ gateways.append({
363
+ "id": f"xor_split_{node}",
364
+ "type": "exclusive_gateway",
365
+ "direction": "split",
366
+ "attached_to": node
367
+ })
368
+
369
+ # Potential XOR-join (multiple inputs, one output)
370
+ elif in_degree > 1 and out_degree == 1:
371
+ gateways.append({
372
+ "id": f"xor_join_{node}",
373
+ "type": "exclusive_gateway",
374
+ "direction": "join",
375
+ "attached_to": node
376
+ })
377
+
378
+ # Potential AND-split/join or complex gateway
379
+ elif in_degree > 1 and out_degree > 1:
380
+ gateways.append({
381
+ "id": f"complex_{node}",
382
+ "type": "complex_gateway",
383
+ "direction": "mixed",
384
+ "attached_to": node
385
+ })
386
+
387
+ return gateways
388
+
389
+ def _generate_petri_net_places(self) -> List[str]:
390
+ """
391
+ Generate places for a Petri net representation.
392
+
393
+ Returns:
394
+ List of place IDs
395
+ """
396
+ places = []
397
+
398
+ # Generate places between each pair of activities
399
+ for u, v in self.process_graph.edges():
400
+ places.append(f"p_{u}_{v}")
401
+
402
+ # Add start and end places
403
+ start_nodes = [n for n in self.process_graph.nodes()
404
+ if self.process_graph.in_degree(n) == 0]
405
+ for node in start_nodes:
406
+ places.append(f"p_start_{node}")
407
+
408
+ end_nodes = [n for n in self.process_graph.nodes()
409
+ if self.process_graph.out_degree(n) == 0]
410
+ for node in end_nodes:
411
+ places.append(f"p_{node}_end")
412
+
413
+ return places
414
+
415
+ def _generate_petri_net_arcs(self) -> List[Tuple[str, str]]:
416
+ """
417
+ Generate arcs for a Petri net representation.
418
+
419
+ Returns:
420
+ List of (source, target) tuples representing arcs
421
+ """
422
+ arcs = []
423
+
424
+ # Connect transitions through places
425
+ for u, v in self.process_graph.edges():
426
+ place = f"p_{u}_{v}"
427
+ arcs.append((u, place))
428
+ arcs.append((place, v))
429
+
430
+ # Connect start places to initial transitions
431
+ start_nodes = [n for n in self.process_graph.nodes()
432
+ if self.process_graph.in_degree(n) == 0]
433
+ for node in start_nodes:
434
+ arcs.append((f"p_start_{node}", node))
435
+
436
+ # Connect final transitions to end places
437
+ end_nodes = [n for n in self.process_graph.nodes()
438
+ if self.process_graph.out_degree(n) == 0]
439
+ for node in end_nodes:
440
+ arcs.append((node, f"p_{node}_end"))
441
+
442
+ return arcs
443
 
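The ProcessDiscoveryEngine above is self-contained, so a minimal driver can exercise it end to end. The sketch below is illustrative only and is not taken from this commit; the sample column values and config numbers are assumptions chosen to satisfy the ingest_log_data contract (timestamp, action, optional case_id).

import pandas as pd

# Assumed sample event log; any DataFrame with 'timestamp', 'action' and
# optionally 'case_id' columns would work the same way.
sample_logs = pd.DataFrame({
    'timestamp': ['2024-01-01 09:00:00', '2024-01-01 09:00:40', '2024-01-01 09:01:10',
                  '2024-01-01 10:00:00', '2024-01-01 10:00:35', '2024-01-01 10:01:05'],
    'case_id':   ['c1', 'c1', 'c1', 'c2', 'c2', 'c2'],
    'action':    ['receive_invoice', 'validate_invoice', 'approve_invoice',
                  'receive_invoice', 'validate_invoice', 'approve_invoice'],
})

engine = ProcessDiscoveryEngine({'min_frequency': 0.05, 'time_threshold': 60})
engine.ingest_log_data(sample_logs)                       # builds the directed process graph
main_paths = engine.discover_main_process_paths()         # paths ranked by minimum edge weight
stats = engine.get_process_stats()                        # activity/transition frequency summary
model = engine.export_process_model(format_type='json')   # serializable node/edge export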
444
+ # requirements_analysis_module.py
445
 
446
 
447
+ class RequirementsAnalysisModule:
448
+ """
449
+ Analyzes business requirements and connects them to processes.
450
+ Extracts structured data from natural language requirements.
451
+ """
452
+
453
+ def __init__(self, config: Dict = None):
454
+ """
455
+ Initialize the requirements analysis module.
456
+
457
+ Args:
458
+ config: Configuration dictionary
459
+ """
460
+ self.config = config or {}
461
+
462
+ # Load NLP model
463
+ try:
464
+ self.nlp = spacy.load("en_core_web_md")
465
+ except:
466
+ # Fallback to small model if medium not available
467
+ self.nlp = spacy.load("en_core_web_sm")
468
+
469
+ # Initialize requirements storage
470
+ self.requirements = []
471
+
472
+ # Initialize taxonomy and patterns
473
+ self._load_taxonomies()
474
+ self._compile_requirement_patterns()
475
+
476
+ def _load_taxonomies(self) -> None:
477
+ """Load or initialize the business process taxonomy."""
478
+ # In production, this would load from a file or database
479
+ self.process_taxonomy = {
480
+ "financial": [
481
+ "invoice processing", "accounts payable", "accounts receivable",
482
+ "payment processing", "financial reporting", "expense management"
483
+ ],
484
+ "hr": [
485
+ "onboarding", "offboarding", "payroll", "recruitment",
486
+ "employee management", "benefits administration", "time tracking"
487
+ ],
488
+ "customer_service": [
489
+ "ticket management", "customer support", "inquiry handling",
490
+ "complaint resolution", "feedback processing"
491
+ ],
492
+ "operations": [
493
+ "inventory management", "supply chain", "logistics",
494
+ "order processing", "shipping", "receiving", "quality control"
495
+ ],
496
+ "sales": [
497
+ "lead management", "opportunity tracking", "quote generation",
498
+ "contract management", "sales reporting", "commission calculation"
499
+ ],
500
+ "it": [
501
+ "access management", "incident management", "change management",
502
+ "service request", "problem management", "release management"
503
+ ]
504
+ }
505
+
506
+ # Complexity indicators for requirements
507
+ self.complexity_indicators = {
508
+ "high": [
509
+ "complex", "multiple systems", "integration", "decision tree",
510
+ "exception handling", "compliance", "regulatory", "manual review",
511
+ "approval workflow", "conditional logic", "business rules"
512
+ ],
513
+ "medium": [
514
+ "validation", "verification", "notification", "alert",
515
+ "scheduled", "reporting", "dashboard", "data transformation"
516
+ ],
517
+ "low": [
518
+ "simple", "straightforward", "data entry", "form filling",
519
+ "standard", "single system", "fixed path", "static rules"
520
+ ]
521
+ }
522
+
523
+ def _compile_requirement_patterns(self) -> None:
524
+ """Compile regex patterns for requirement extraction."""
525
+ # Action patterns
526
+ self.action_patterns = [
527
+ r"(?:need|should|must|will|shall) (?:to )?([a-z]+)",
528
+ r"responsible for ([a-z]+ing)",
529
+ r"capability to ([a-z]+)",
530
+ r"ability to ([a-z]+)"
531
+ ]
532
+
533
+ # System patterns
534
+ self.system_patterns = [
535
+ r"(?:in|from|to|using|within) (?:the )?([A-Za-z0-9]+)(?: system| application| platform| software| tool)?",
536
+ r"([A-Za-z0-9]+)(?: system| application| platform| software| tool)",
537
+ r"([A-Za-z0-9]+) (?:database|interface|API|server)"
538
+ ]
539
+
540
+ # Frequency patterns
541
+ self.frequency_patterns = [
542
+ r"(daily|weekly|monthly|quarterly|yearly|annually)",
543
+ r"every ([0-9]+) (day|week|month|quarter|year)s?",
544
+ r"([0-9]+) times per (day|week|month|year)"
545
+ ]
546
+
547
+ # Compile all patterns
548
+ self.action_regex = [re.compile(pattern) for pattern in self.action_patterns]
549
+ self.system_regex = [re.compile(pattern) for pattern in self.system_patterns]
550
+ self.frequency_regex = [re.compile(pattern) for pattern in self.frequency_patterns]
551
+
552
+ def analyze_text_requirement(self, requirement_text: str, source: str = None) -> Dict:
553
+ """
554
+ Analyze a natural language requirement and extract structured information.
555
+
556
+ Args:
557
+ requirement_text: The text of the requirement
558
+ source: Source of the requirement
559
+
560
+ Returns:
561
+ Dictionary with extracted requirement information
562
+ """
563
+ # Parse with spaCy
564
+ doc = self.nlp(requirement_text)
565
+
566
+ # Basic requirement object
567
+ requirement = {
568
+ "id": f"REQ-{len(self.requirements) + 1}",
569
+ "text": requirement_text,
570
+ "source": source,
571
+ "extracted": {
572
+ "actions": self._extract_actions(doc, requirement_text),
573
+ "systems": self._extract_systems(doc, requirement_text),
574
+ "frequency": self._extract_frequency(requirement_text),
575
+ "business_domain": self._classify_business_domain(doc),
576
+ "complexity": self._assess_complexity(doc, requirement_text),
577
+ "data_elements": self._extract_data_elements(doc)
578
+ },
579
+ "automation_potential": None # Will be filled later
580
+ }
581
+
582
+ # Store the requirement
583
+ self.requirements.append(requirement)
584
+ return requirement
585
+
586
+ def _extract_actions(self, doc, text: str) -> List[str]:
587
+ """
588
+ Extract action verbs from requirement text.
589
+
590
+ Args:
591
+ doc: spaCy processed document
592
+ text: Original text
593
+
594
+ Returns:
595
+ List of action verbs
596
+ """
597
+ # Method 1: Use spaCy to find verbs
598
+ verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]
599
+
600
+ # Method 2: Use regex patterns
601
+ pattern_matches = []
602
+ for pattern in self.action_regex:
603
+ matches = pattern.findall(text.lower())
604
+ pattern_matches.extend(matches)
605
+
606
+ # Combine and deduplicate
607
+ all_actions = list(set(verbs + pattern_matches))
608
+
609
+ # Filter out common non-action verbs
610
+ stopwords = ["be", "is", "are", "was", "were", "have", "has", "had"]
611
+ filtered_actions = [v for v in all_actions if v not in stopwords and len(v) > 2]
612
+
613
+ return filtered_actions
614
+
615
+ def _extract_systems(self, doc, text: str) -> List[str]:
616
+ """
617
+ Extract system names from requirement text.
618
+
619
+ Args:
620
+ doc: spaCy processed document
621
+ text: Original text
622
+
623
+ Returns:
624
+ List of system names
625
+ """
626
+ # Method 1: Named Entity Recognition for PRODUCT entities
627
+ ner_systems = [ent.text for ent in doc.ents
628
+ if ent.label_ in ["PRODUCT", "ORG", "GPE"]]
629
+
630
+ # Method 2: Pattern matching
631
+ pattern_systems = []
632
+ for pattern in self.system_regex:
633
+ matches = pattern.findall(text)
634
+ pattern_systems.extend(matches)
635
+
636
+ # Combine results
637
+ all_systems = list(set(ner_systems + pattern_systems))
638
+
639
+ # Filter out common false positives
640
+ stopwords = ["system", "process", "application", "data", "information", "this", "the"]
641
+ filtered_systems = [s for s in all_systems if s.lower() not in stopwords and len(s) > 2]
642
+
643
+ return filtered_systems
644
+
645
+ def _extract_frequency(self, text: str) -> Optional[str]:
646
+ """
647
+ Extract frequency information from requirement text.
648
+
649
+ Args:
650
+ text: Requirement text
651
+
652
+ Returns:
653
+ Extracted frequency or None
654
+ """
655
+ text_lower = text.lower()
656
+
657
+ # Check all frequency patterns
658
+ for pattern in self.frequency_regex:
659
+ match = pattern.search(text_lower)
660
+ if match:
661
+ return match.group(0)
662
+
663
+ # Check for specific frequency words
664
+ frequency_words = ["daily", "weekly", "monthly", "quarterly", "annually", "yearly"]
665
+ for word in frequency_words:
666
+ if word in text_lower:
667
+ return word
668
+
669
+ return None
670
+
671
+ def _classify_business_domain(self, doc) -> List[Tuple[str, float]]:
672
+ """
673
+ Classify the business domain of the requirement.
674
+
675
+ Args:
676
+ doc: spaCy processed document
677
+
678
+ Returns:
679
+ List of (domain, confidence) tuples
680
+ """
681
+ text = doc.text.lower()
682
+ domain_scores = {}
683
+
684
+ # Calculate score for each domain based on keyword matches
685
+ for domain, keywords in self.process_taxonomy.items():
686
+ domain_score = 0
687
+ for keyword in keywords:
688
+ if keyword in text:
689
+ domain_score += 1
690
+
691
+ if domain_score > 0:
692
+ # Normalize by number of keywords
693
+ domain_scores[domain] = domain_score / len(keywords)
694
+
695
+ # If no direct matches, use semantic similarity
696
+ if not domain_scores:
697
+ for domain, keywords in self.process_taxonomy.items():
698
+ # Calculate average similarity between doc and each keyword
699
+ similarities = [doc.similarity(self.nlp(keyword)) for keyword in keywords]
700
+ avg_similarity = sum(similarities) / len(similarities) if similarities else 0
701
+
702
+ if avg_similarity > 0.5: # Threshold for relevance
703
+ domain_scores[domain] = avg_similarity
704
+
705
+ # Sort by score and return
706
+ sorted_domains = sorted(domain_scores.items(), key=lambda x: x[1], reverse=True)
707
+ return sorted_domains
708
+
709
+ def _assess_complexity(self, doc, text: str) -> str:
710
+ """
711
+ Assess the complexity of the requirement.
712
+
713
+ Args:
714
+ doc: spaCy processed document
715
+ text: Original text
716
+
717
+ Returns:
718
+ Complexity level ("high", "medium", or "low")
719
+ """
720
+ text_lower = text.lower()
721
+
722
+ # Count indicators for each complexity level
723
+ scores = {level: 0 for level in self.complexity_indicators.keys()}
724
+
725
+ for level, indicators in self.complexity_indicators.items():
726
+ for indicator in indicators:
727
+ if indicator in text_lower:
728
+ scores[level] += 1
729
+
730
+ # Check sentence structure complexity
731
+ sentence_count = len(list(doc.sents))
732
+ avg_tokens_per_sentence = len(doc) / sentence_count if sentence_count > 0 else 0
733
+
734
+ # Adjust scores based on structural complexity
735
+ if avg_tokens_per_sentence > 25:
736
+ scores["high"] += 1
737
+ elif avg_tokens_per_sentence > 15:
738
+ scores["medium"] += 1
739
+
740
+ # Check for conditional statements (if/then)
741
+ if "if" in text_lower and ("then" in text_lower or "else" in text_lower):
742
+ scores["high"] += 1
743
+
744
+ # Determine final complexity
745
+ if scores["high"] > 0:
746
+ return "high"
747
+ elif scores["medium"] > 0:
748
+ return "medium"
749
+ else:
750
+ return "low"
751
+
752
+ def _extract_data_elements(self, doc) -> List[str]:
753
+ """
754
+ Extract data elements from the requirement text.
755
+
756
+ Args:
757
+ doc: spaCy processed document
758
+
759
+ Returns:
760
+ List of data elements
761
+ """
762
+ # Find noun chunks that could be data elements
763
+ data_elements = []
764
+
765
+ for chunk in doc.noun_chunks:
766
+ # Check if this looks like a data field
767
+ if (any(token.pos_ == "NOUN" for token in chunk) and
768
+ len(chunk) <= 4 and # Not too long
769
+ not any(token.is_stop for token in chunk)): # Not all stopwords
770
+ data_elements.append(chunk.text)
771
+
772
+ # Look for specific data patterns
773
+ data_patterns = [
774
+ (r"\b[A-Z][a-z]+ ID\b", "ID field"),
775
+ (r"\b[A-Z][a-z]+ Number\b", "Number field"),
776
+ (r"\b[A-Z][a-z]+ Code\b", "Code field"),
777
+ (r"\b[A-Z][a-z]+ Date\b", "Date field"),
778
+ (r"\bstatus\b", "Status field")
779
+ ]
780
+
781
+ for pattern, field_type in data_patterns:
782
+ if re.search(pattern, doc.text):
783
+ data_elements.append(field_type)
784
+
785
+ return list(set(data_elements))
786
+
787
+ def analyze_requirements_batch(self, requirements: List[Dict]) -> List[Dict]:
788
+ """
789
+ Analyze a batch of requirements and find relationships between them.
790
+
791
+ Args:
792
+ requirements: List of requirement dictionaries with 'text' field
793
+
794
+ Returns:
795
+ List of analyzed requirements
796
+ """
797
+ # Process each requirement
798
+ processed_requirements = []
799
+ for req in requirements:
800
+ req_text = req.get('text', '')
801
+ source = req.get('source', 'batch')
802
+ processed = self.analyze_text_requirement(req_text, source)
803
+ processed_requirements.append(processed)
804
+
805
+ # Find relationships between requirements
806
+ self._find_requirement_relationships(processed_requirements)
807
+
808
+ return processed_requirements
809
+
810
+ def _find_requirement_relationships(self, requirements: List[Dict]) -> None:
811
+ """
812
+ Find and add relationships between requirements.
813
+
814
+ Args:
815
+ requirements: List of processed requirements
816
+ """
817
+ if len(requirements) < 2:
818
+ return
819
+
820
+ # Extract text from requirements
821
+ texts = [req["text"] for req in requirements]
822
+
823
+ # Create TF-IDF matrix
824
+ vectorizer = TfidfVectorizer(stop_words='english')
825
+ tfidf_matrix = vectorizer.fit_transform(texts)
826
+
827
+ # Calculate similarity matrix
828
+ similarity_matrix = cosine_similarity(tfidf_matrix)
829
+
830
+ # Add relationships to requirements
831
+ for i, req in enumerate(requirements):
832
+ related = []
833
+
834
+ for j