USER-GNEXUSES committed on
Commit e7597b4 · verified · 1 Parent(s): 22a8e59

Update app.py

Files changed (1)
  1. app.py +826 -61
app.py CHANGED
@@ -1,69 +1,834 @@
1
- from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
2
- import datetime
3
- import requests
4
- import pytz
5
- import yaml
6
- from tools.final_answer import FinalAnswerTool
7
 
8
- from Gradio_UI import GradioUI
9
 
10
- # Below is an example of a tool that does nothing. Amaze us with your creativity !
11
- @tool
12
- def my_custom_tool(arg1:str, arg2:int)-> str: #it's import to specify the return type
13
- #Keep this format for the description / args / args description but feel free to modify the tool
14
- """A tool that does nothing yet
15
- Args:
16
- arg1: the first argument
17
- arg2: the second argument
18
  """
19
- return "What magic will you build ?"
20
-
21
- @tool
22
- def get_current_time_in_timezone(timezone: str) -> str:
23
- """A tool that fetches the current local time in a specified timezone.
24
- Args:
25
- timezone: A string representing a valid timezone (e.g., 'America/New_York').
26
  """
27
- try:
28
- # Create timezone object
29
- tz = pytz.timezone(timezone)
30
- # Get current time in that timezone
31
- local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
32
- return f"The current local time in {timezone} is: {local_time}"
33
- except Exception as e:
34
- return f"Error fetching time for timezone '{timezone}': {str(e)}"
35
-
36
-
37
- final_answer = FinalAnswerTool()
38
-
39
- # If the agent does not answer, the model is overloaded, please use another model or the following Hugging Face Endpoint that also contains qwen2.5 coder:
40
- # model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud'
41
-
42
- model = HfApiModel(
43
- max_tokens=2096,
44
- temperature=0.5,
45
- model_id='Qwen/Qwen2.5-Coder-32B-Instruct',# it is possible that this model may be overloaded
46
- custom_role_conversions=None,
47
- )
48
-
49
 
50
- # Import tool from Hub
51
- image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
52
 
53
- with open("prompts.yaml", 'r') as stream:
54
- prompt_templates = yaml.safe_load(stream)
55
-
56
- agent = CodeAgent(
57
- model=model,
58
- tools=[final_answer], ## add your tools here (don't remove final answer)
59
- max_steps=6,
60
- verbosity_level=1,
61
- grammar=None,
62
- planning_interval=None,
63
- name=None,
64
- description=None,
65
- prompt_templates=prompt_templates
66
- )
67
 
68
 
69
- GradioUI(agent).launch()
1
+ # process_discovery_engine.py
2
 
3
+ import numpy as np
4
+ import pandas as pd
5
+ from typing import Dict, List, Tuple, Optional
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+ import spacy
9
+ import json
10
+ import re
11
+ import networkx as nx
12
+ from sklearn.cluster import DBSCAN
13
 
14
+ class ProcessDiscoveryEngine:
15
  """
16
+ Discovers and analyzes business processes from various data sources
17
+ including logs, documents, and recorded user activities.
18
  """
19
+
20
+ def __init__(self, config: Dict):
21
+ """
22
+ Initialize the process discovery engine.
23
+
24
+ Args:
25
+ config: Configuration dictionary with parameters
26
+ """
27
+ self.min_frequency = config.get('min_frequency', 0.05)
28
+ self.time_threshold = config.get('time_threshold', 60) # seconds
29
+ self.similarity_threshold = config.get('similarity_threshold', 0.75)
30
+ self.process_graph = nx.DiGraph()
31
+
32
+ def ingest_log_data(self, log_data: pd.DataFrame) -> bool:
33
+ """
34
+ Ingest process log data from system logs.
35
+
36
+ Args:
37
+ log_data: DataFrame containing log entries with timestamp, user, action columns
38
+
39
+ Returns:
40
+ bool: Success status
41
+ """
42
+ if 'timestamp' not in log_data.columns or 'action' not in log_data.columns:
43
+ return False
44
+
45
+ # Sort by timestamp
46
+ sorted_logs = log_data.sort_values('timestamp')
47
+
48
+ # Group by case_id if available
49
+ if 'case_id' in sorted_logs.columns:
50
+ case_groups = sorted_logs.groupby('case_id')
51
+ for case_id, case_data in case_groups:
52
+ self._process_sequence(case_data['action'].tolist(),
53
+ source=f"log:{case_id}")
54
+ else:
55
+ # Try to identify sessions based on time gaps
56
+ self._segment_and_process_logs(sorted_logs)
57
+
58
+ return True
59
+
60
+ def ingest_screen_recordings(self, recording_analysis: List[Dict]) -> bool:
61
+ """
62
+ Ingest analyzed screen recording data.
63
+
64
+ Args:
65
+ recording_analysis: List of dictionaries containing screen activities
66
+
67
+ Returns:
68
+ bool: Success status
69
+ """
70
+ for session in recording_analysis:
71
+ if 'actions' in session and isinstance(session['actions'], list):
72
+ action_sequence = [a['activity'] for a in session['actions']
73
+ if 'activity' in a]
74
+ self._process_sequence(action_sequence,
75
+ source=f"recording:{session.get('id', 'unknown')}")
76
+
77
+ return True
78
+
79
+ def _segment_and_process_logs(self, logs: pd.DataFrame) -> None:
80
+ """
81
+ Segment logs into probable process instances based on time gaps.
82
+
83
+ Args:
84
+ logs: DataFrame of logs sorted by timestamp
85
+ """
86
+ logs['timestamp'] = pd.to_datetime(logs['timestamp'])
87
+ logs['time_diff'] = logs['timestamp'].diff().dt.total_seconds()
88
+
89
+ # Mark new sequences where time difference exceeds threshold
90
+ new_sequence = logs['time_diff'] > self.time_threshold
91
+ logs['sequence_id'] = new_sequence.cumsum()
92
+
93
+ # Process each sequence
94
+ for seq_id, sequence in logs.groupby('sequence_id'):
95
+ self._process_sequence(sequence['action'].tolist(),
96
+ source=f"timegap:{seq_id}")
97
+
98
+ def _process_sequence(self, actions: List[str], source: str) -> None:
99
+ """
100
+ Process a sequence of actions into the process graph.
101
+
102
+ Args:
103
+ actions: List of action names in sequence
104
+ source: Data source identifier
105
+ """
106
+ for i in range(len(actions) - 1):
107
+ current = actions[i]
108
+ next_action = actions[i+1]
109
+
110
+ # Add nodes if they don't exist
111
+ if current not in self.process_graph:
112
+ self.process_graph.add_node(current, count=0, sources=set())
113
+ if next_action not in self.process_graph:
114
+ self.process_graph.add_node(next_action, count=0, sources=set())
115
+
116
+ # Update node data
117
+ self.process_graph.nodes[current]['count'] += 1
118
+ self.process_graph.nodes[current]['sources'].add(source)
119
+
120
+ # Add or update edge
121
+ if self.process_graph.has_edge(current, next_action):
122
+ self.process_graph[current][next_action]['weight'] += 1
123
+ self.process_graph[current][next_action]['sources'].add(source)
124
+ else:
125
+ self.process_graph.add_edge(current, next_action,
126
+ weight=1, sources={source})
127
+
128
+ def discover_main_process_paths(self) -> List[Dict]:
129
+ """
130
+ Discover the main process paths from the constructed graph.
131
+
132
+ Returns:
133
+ List of dictionaries describing main process paths
134
+ """
135
+ # Filter edges by frequency
136
+ total_transitions = sum(data['weight'] for _, _, data in self.process_graph.edges(data=True))
137
+
138
+ if total_transitions == 0:
139
+ return []
140
+
141
+ min_edge_weight = total_transitions * self.min_frequency
142
+ significant_edges = [(u, v) for u, v, d in self.process_graph.edges(data=True)
143
+ if d['weight'] > min_edge_weight]
144
+
145
+ # Create subgraph with only significant edges
146
+ significant_graph = self.process_graph.edge_subgraph(significant_edges).copy()
147
+
148
+ # Find all simple paths from potential start nodes to end nodes
149
+ start_nodes = [n for n in significant_graph.nodes()
150
+ if significant_graph.in_degree(n) == 0 or
151
+ significant_graph.in_degree(n) < significant_graph.out_degree(n)]
152
+
153
+ end_nodes = [n for n in significant_graph.nodes()
154
+ if significant_graph.out_degree(n) == 0 or
155
+ significant_graph.out_degree(n) < significant_graph.in_degree(n)]
156
+
157
+ # If no clear start/end, use nodes with highest centrality
158
+ if not start_nodes:
159
+ centrality = nx.degree_centrality(significant_graph)
160
+ start_nodes = [max(centrality, key=centrality.get)]
161
+
162
+ if not end_nodes:
163
+ centrality = nx.degree_centrality(significant_graph)
164
+ end_nodes = [max(centrality, key=centrality.get)]
165
+
166
+ # Find all paths between start and end nodes
167
+ all_paths = []
168
+ for start in start_nodes:
169
+ for end in end_nodes:
170
+ try:
171
+ paths = list(nx.all_simple_paths(significant_graph, start, end))
172
+ all_paths.extend(paths)
173
+ except nx.NetworkXNoPath:
174
+ continue
175
+
176
+ # Calculate path frequency and return top paths
177
+ path_data = []
178
+ for path in all_paths:
179
+ # Calculate path strength as minimum edge weight along path
180
+ edge_weights = [significant_graph[path[i]][path[i+1]]['weight']
181
+ for i in range(len(path)-1)]
182
+ path_strength = min(edge_weights) if edge_weights else 0
183
+
184
+ path_data.append({
185
+ 'path': path,
186
+ 'strength': path_strength,
187
+ 'length': len(path),
188
+ 'avg_edge_weight': sum(edge_weights) / len(edge_weights) if edge_weights else 0
189
+ })
190
+
191
+ # Sort by path strength descending
192
+ path_data.sort(key=lambda x: x['strength'], reverse=True)
193
+
194
+ return path_data
195
+
196
+ def identify_process_variants(self) -> List[Dict]:
197
+ """
198
+ Identify variants of the same basic process.
199
+
200
+ Returns:
201
+ List of process variant clusters
202
+ """
203
+ if len(self.process_graph) < 2:
204
+ return []
205
+
206
+ # Extract features for clustering
207
+ paths = self.discover_main_process_paths()
208
+ if not paths:
209
+ return []
210
+
211
+ # Create feature vectors from paths
212
+ all_activities = sorted(list(self.process_graph.nodes()))
213
+ activity_indices = {act: i for i, act in enumerate(all_activities)}
214
+
215
+ # Create feature vectors (activity presence and position)
216
+ feature_vectors = []
217
+ for path_data in paths:
218
+ path = path_data['path']
219
+ vector = np.zeros(len(all_activities) * 2)
220
+
221
+ # Mark presence and relative position of activities
222
+ for pos, activity in enumerate(path):
223
+ idx = activity_indices[activity]
224
+ vector[idx] = 1 # presence
225
+ vector[idx + len(all_activities)] = pos / len(path) # relative position
226
+
227
+ feature_vectors.append(vector)
228
+
229
+ # Cluster paths using DBSCAN
230
+ if len(feature_vectors) < 2:
231
+ return [{'variant_id': 0, 'paths': paths}]
232
+
233
+ clustering = DBSCAN(eps=0.3, min_samples=1).fit(feature_vectors)
234
+ labels = clustering.labels_
235
+
236
+ # Group paths by cluster
237
+ variants = {}
238
+ for i, label in enumerate(labels):
239
+ label_str = str(label)
240
+ if label_str not in variants:
241
+ variants[label_str] = []
242
+ variants[label_str].append(paths[i])
243
+
244
+ # Format result
245
+ result = [
246
+ {'variant_id': variant_id, 'paths': variant_paths}
247
+ for variant_id, variant_paths in variants.items()
248
+ ]
249
+
250
+ return result
251
+
252
+ def get_process_stats(self) -> Dict:
253
+ """
254
+ Get statistics about the discovered process.
255
+
256
+ Returns:
257
+ Dictionary with process statistics
258
+ """
259
+ if not self.process_graph:
260
+ return {"error": "No process data available"}
261
+
262
+ stats = {
263
+ "num_activities": len(self.process_graph.nodes()),
264
+ "num_transitions": len(self.process_graph.edges()),
265
+ "most_frequent_activities": [],
266
+ "most_frequent_transitions": [],
267
+ "process_complexity": 0,
268
+ "data_sources": set()
269
+ }
270
+
271
+ # Most frequent activities
272
+ activities = [(node, data['count'])
273
+ for node, data in self.process_graph.nodes(data=True)]
274
+ activities.sort(key=lambda x: x[1], reverse=True)
275
+ stats["most_frequent_activities"] = activities[:10]
276
+
277
+ # Most frequent transitions
278
+ transitions = [(u, v, data['weight'])
279
+ for u, v, data in self.process_graph.edges(data=True)]
280
+ transitions.sort(key=lambda x: x[2], reverse=True)
281
+ stats["most_frequent_transitions"] = transitions[:10]
282
+
283
+ # Process complexity (using Control-Flow Complexity metric)
284
+ stats["process_complexity"] = sum(self.process_graph.out_degree(n) for n in self.process_graph.nodes())
285
+
286
+ # Data sources
287
+ for _, data in self.process_graph.nodes(data=True):
288
+ if 'sources' in data:
289
+ stats["data_sources"].update(data['sources'])
290
+
291
+ stats["data_sources"] = list(stats["data_sources"])
292
+
293
+ return stats
294
 
295
+ def export_process_model(self, format_type: str = 'bpmn') -> Dict:
296
+ """
297
+ Export the discovered process in the specified format.
298
+
299
+ Args:
300
+ format_type: Output format ('bpmn', 'petri_net', or 'json')
301
+
302
+ Returns:
303
+ Dictionary with export data and metadata
304
+ """
305
+ if format_type == 'json':
306
+ nodes = [{"id": n, "count": data.get('count', 0)}
307
+ for n, data in self.process_graph.nodes(data=True)]
308
+
309
+ edges = [{"source": u, "target": v, "weight": data.get('weight', 0)}
310
+ for u, v, data in self.process_graph.edges(data=True)]
311
+
312
+ return {
313
+ "format": "json",
314
+ "process_model": {
315
+ "nodes": nodes,
316
+ "edges": edges
317
+ }
318
+ }
319
+
320
+ elif format_type == 'bpmn':
321
+ # Basic BPMN conversion (simplified)
322
+ # In a real implementation, this would generate actual BPMN XML
323
+ return {
324
+ "format": "bpmn",
325
+ "process_model": {
326
+ "process_id": "discovered_process",
327
+ "activities": list(self.process_graph.nodes()),
328
+ "flows": [(u, v) for u, v in self.process_graph.edges()],
329
+ "gateways": self._identify_potential_gateways()
330
+ }
331
+ }
332
+
333
+ elif format_type == 'petri_net':
334
+ # Basic Petri net conversion (simplified)
335
+ return {
336
+ "format": "petri_net",
337
+ "process_model": {
338
+ "places": self._generate_petri_net_places(),
339
+ "transitions": list(self.process_graph.nodes()),
340
+ "arcs": self._generate_petri_net_arcs()
341
+ }
342
+ }
343
+
344
+ else:
345
+ return {"error": f"Unsupported export format: {format_type}"}
346
+
347
+ def _identify_potential_gateways(self) -> List[Dict]:
348
+ """
349
+ Identify potential gateways in the process based on branching.
350
+
351
+ Returns:
352
+ List of potential gateway nodes
353
+ """
354
+ gateways = []
355
+
356
+ for node in self.process_graph.nodes():
357
+ in_degree = self.process_graph.in_degree(node)
358
+ out_degree = self.process_graph.out_degree(node)
359
+
360
+ # Potential XOR-split (one input, multiple outputs)
361
+ if in_degree == 1 and out_degree > 1:
362
+ gateways.append({
363
+ "id": f"xor_split_{node}",
364
+ "type": "exclusive_gateway",
365
+ "direction": "split",
366
+ "attached_to": node
367
+ })
368
+
369
+ # Potential XOR-join (multiple inputs, one output)
370
+ elif in_degree > 1 and out_degree == 1:
371
+ gateways.append({
372
+ "id": f"xor_join_{node}",
373
+ "type": "exclusive_gateway",
374
+ "direction": "join",
375
+ "attached_to": node
376
+ })
377
+
378
+ # Potential AND-split/join or complex gateway
379
+ elif in_degree > 1 and out_degree > 1:
380
+ gateways.append({
381
+ "id": f"complex_{node}",
382
+ "type": "complex_gateway",
383
+ "direction": "mixed",
384
+ "attached_to": node
385
+ })
386
+
387
+ return gateways
388
+
389
+ def _generate_petri_net_places(self) -> List[str]:
390
+ """
391
+ Generate places for a Petri net representation.
392
+
393
+ Returns:
394
+ List of place IDs
395
+ """
396
+ places = []
397
+
398
+ # Generate places between each pair of activities
399
+ for u, v in self.process_graph.edges():
400
+ places.append(f"p_{u}_{v}")
401
+
402
+ # Add start and end places
403
+ start_nodes = [n for n in self.process_graph.nodes()
404
+ if self.process_graph.in_degree(n) == 0]
405
+ for node in start_nodes:
406
+ places.append(f"p_start_{node}")
407
+
408
+ end_nodes = [n for n in self.process_graph.nodes()
409
+ if self.process_graph.out_degree(n) == 0]
410
+ for node in end_nodes:
411
+ places.append(f"p_{node}_end")
412
+
413
+ return places
414
+
415
+ def _generate_petri_net_arcs(self) -> List[Tuple[str, str]]:
416
+ """
417
+ Generate arcs for a Petri net representation.
418
+
419
+ Returns:
420
+ List of (source, target) tuples representing arcs
421
+ """
422
+ arcs = []
423
+
424
+ # Connect transitions through places
425
+ for u, v in self.process_graph.edges():
426
+ place = f"p_{u}_{v}"
427
+ arcs.append((u, place))
428
+ arcs.append((place, v))
429
+
430
+ # Connect start places to initial transitions
431
+ start_nodes = [n for n in self.process_graph.nodes()
432
+ if self.process_graph.in_degree(n) == 0]
433
+ for node in start_nodes:
434
+ arcs.append((f"p_start_{node}", node))
435
+
436
+ # Connect final transitions to end places
437
+ end_nodes = [n for n in self.process_graph.nodes()
438
+ if self.process_graph.out_degree(n) == 0]
439
+ for node in end_nodes:
440
+ arcs.append((node, f"p_{node}_end"))
441
+
442
+ return arcs
443
 
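The ProcessDiscoveryEngine above is self-contained, so a minimal driver can exercise it end to end. The sketch below is illustrative only and is not taken from this commit; the sample column values and config numbers are assumptions chosen to satisfy the ingest_log_data contract (timestamp, action, optional case_id).

import pandas as pd

# Assumed sample event log; any DataFrame with 'timestamp', 'action' and
# optionally 'case_id' columns would work the same way.
sample_logs = pd.DataFrame({
    'timestamp': ['2024-01-01 09:00:00', '2024-01-01 09:00:40', '2024-01-01 09:01:10',
                  '2024-01-01 10:00:00', '2024-01-01 10:00:35', '2024-01-01 10:01:05'],
    'case_id':   ['c1', 'c1', 'c1', 'c2', 'c2', 'c2'],
    'action':    ['receive_invoice', 'validate_invoice', 'approve_invoice',
                  'receive_invoice', 'validate_invoice', 'approve_invoice'],
})

engine = ProcessDiscoveryEngine({'min_frequency': 0.05, 'time_threshold': 60})
engine.ingest_log_data(sample_logs)                       # builds the directed process graph
main_paths = engine.discover_main_process_paths()         # paths ranked by minimum edge weight
stats = engine.get_process_stats()                        # activity/transition frequency summary
model = engine.export_process_model(format_type='json')   # serializable node/edge export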
444
+ # requirements_analysis_module.py
445
 
446
 
447
+ class RequirementsAnalysisModule:
448
+ """
449
+ Analyzes business requirements and connects them to processes.
450
+ Extracts structured data from natural language requirements.
451
+ """
452
+
453
+ def __init__(self, config: Dict = None):
454
+ """
455
+ Initialize the requirements analysis module.
456
+
457
+ Args:
458
+ config: Configuration dictionary
459
+ """
460
+ self.config = config or {}
461
+
462
+ # Load NLP model
463
+ try:
464
+ self.nlp = spacy.load("en_core_web_md")
465
+ except:
466
+ # Fallback to small model if medium not available
467
+ self.nlp = spacy.load("en_core_web_sm")
468
+
469
+ # Initialize requirements storage
470
+ self.requirements = []
471
+
472
+ # Initialize taxonomy and patterns
473
+ self._load_taxonomies()
474
+ self._compile_requirement_patterns()
475
+
476
+ def _load_taxonomies(self) -> None:
477
+ """Load or initialize the business process taxonomy."""
478
+ # In production, this would load from a file or database
479
+ self.process_taxonomy = {
480
+ "financial": [
481
+ "invoice processing", "accounts payable", "accounts receivable",
482
+ "payment processing", "financial reporting", "expense management"
483
+ ],
484
+ "hr": [
485
+ "onboarding", "offboarding", "payroll", "recruitment",
486
+ "employee management", "benefits administration", "time tracking"
487
+ ],
488
+ "customer_service": [
489
+ "ticket management", "customer support", "inquiry handling",
490
+ "complaint resolution", "feedback processing"
491
+ ],
492
+ "operations": [
493
+ "inventory management", "supply chain", "logistics",
494
+ "order processing", "shipping", "receiving", "quality control"
495
+ ],
496
+ "sales": [
497
+ "lead management", "opportunity tracking", "quote generation",
498
+ "contract management", "sales reporting", "commission calculation"
499
+ ],
500
+ "it": [
501
+ "access management", "incident management", "change management",
502
+ "service request", "problem management", "release management"
503
+ ]
504
+ }
505
+
506
+ # Complexity indicators for requirements
507
+ self.complexity_indicators = {
508
+ "high": [
509
+ "complex", "multiple systems", "integration", "decision tree",
510
+ "exception handling", "compliance", "regulatory", "manual review",
511
+ "approval workflow", "conditional logic", "business rules"
512
+ ],
513
+ "medium": [
514
+ "validation", "verification", "notification", "alert",
515
+ "scheduled", "reporting", "dashboard", "data transformation"
516
+ ],
517
+ "low": [
518
+ "simple", "straightforward", "data entry", "form filling",
519
+ "standard", "single system", "fixed path", "static rules"
520
+ ]
521
+ }
522
+
523
+ def _compile_requirement_patterns(self) -> None:
524
+ """Compile regex patterns for requirement extraction."""
525
+ # Action patterns
526
+ self.action_patterns = [
527
+ r"(?:need|should|must|will|shall) (?:to )?([a-z]+)",
528
+ r"responsible for ([a-z]+ing)",
529
+ r"capability to ([a-z]+)",
530
+ r"ability to ([a-z]+)"
531
+ ]
532
+
533
+ # System patterns
534
+ self.system_patterns = [
535
+ r"(?:in|from|to|using|within) (?:the )?([A-Za-z0-9]+)(?: system| application| platform| software| tool)?",
536
+ r"([A-Za-z0-9]+)(?: system| application| platform| software| tool)",
537
+ r"([A-Za-z0-9]+) (?:database|interface|API|server)"
538
+ ]
539
+
540
+ # Frequency patterns
541
+ self.frequency_patterns = [
542
+ r"(daily|weekly|monthly|quarterly|yearly|annually)",
543
+ r"every ([0-9]+) (day|week|month|quarter|year)s?",
544
+ r"([0-9]+) times per (day|week|month|year)"
545
+ ]
546
+
547
+ # Compile all patterns
548
+ self.action_regex = [re.compile(pattern) for pattern in self.action_patterns]
549
+ self.system_regex = [re.compile(pattern) for pattern in self.system_patterns]
550
+ self.frequency_regex = [re.compile(pattern) for pattern in self.frequency_patterns]
551
+
552
+ def analyze_text_requirement(self, requirement_text: str, source: str = None) -> Dict:
553
+ """
554
+ Analyze a natural language requirement and extract structured information.
555
+
556
+ Args:
557
+ requirement_text: The text of the requirement
558
+ source: Source of the requirement
559
+
560
+ Returns:
561
+ Dictionary with extracted requirement information
562
+ """
563
+ # Parse with spaCy
564
+ doc = self.nlp(requirement_text)
565
+
566
+ # Basic requirement object
567
+ requirement = {
568
+ "id": f"REQ-{len(self.requirements) + 1}",
569
+ "text": requirement_text,
570
+ "source": source,
571
+ "extracted": {
572
+ "actions": self._extract_actions(doc, requirement_text),
573
+ "systems": self._extract_systems(doc, requirement_text),
574
+ "frequency": self._extract_frequency(requirement_text),
575
+ "business_domain": self._classify_business_domain(doc),
576
+ "complexity": self._assess_complexity(doc, requirement_text),
577
+ "data_elements": self._extract_data_elements(doc)
578
+ },
579
+ "automation_potential": None # Will be filled later
580
+ }
581
+
582
+ # Store the requirement
583
+ self.requirements.append(requirement)
584
+ return requirement
585
+
586
+ def _extract_actions(self, doc, text: str) -> List[str]:
587
+ """
588
+ Extract action verbs from requirement text.
589
+
590
+ Args:
591
+ doc: spaCy processed document
592
+ text: Original text
593
+
594
+ Returns:
595
+ List of action verbs
596
+ """
597
+ # Method 1: Use spaCy to find verbs
598
+ verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]
599
+
600
+ # Method 2: Use regex patterns
601
+ pattern_matches = []
602
+ for pattern in self.action_regex:
603
+ matches = pattern.findall(text.lower())
604
+ pattern_matches.extend(matches)
605
+
606
+ # Combine and deduplicate
607
+ all_actions = list(set(verbs + pattern_matches))
608
+
609
+ # Filter out common non-action verbs
610
+ stopwords = ["be", "is", "are", "was", "were", "have", "has", "had"]
611
+ filtered_actions = [v for v in all_actions if v not in stopwords and len(v) > 2]
612
+
613
+ return filtered_actions
614
+
615
+ def _extract_systems(self, doc, text: str) -> List[str]:
616
+ """
617
+ Extract system names from requirement text.
618
+
619
+ Args:
620
+ doc: spaCy processed document
621
+ text: Original text
622
+
623
+ Returns:
624
+ List of system names
625
+ """
626
+ # Method 1: Named Entity Recognition for PRODUCT entities
627
+ ner_systems = [ent.text for ent in doc.ents
628
+ if ent.label_ in ["PRODUCT", "ORG", "GPE"]]
629
+
630
+ # Method 2: Pattern matching
631
+ pattern_systems = []
632
+ for pattern in self.system_regex:
633
+ matches = pattern.findall(text)
634
+ pattern_systems.extend(matches)
635
+
636
+ # Combine results
637
+ all_systems = list(set(ner_systems + pattern_systems))
638
+
639
+ # Filter out common false positives
640
+ stopwords = ["system", "process", "application", "data", "information", "this", "the"]
641
+ filtered_systems = [s for s in all_systems if s.lower() not in stopwords and len(s) > 2]
642
+
643
+ return filtered_systems
644
+
645
+ def _extract_frequency(self, text: str) -> Optional[str]:
646
+ """
647
+ Extract frequency information from requirement text.
648
+
649
+ Args:
650
+ text: Requirement text
651
+
652
+ Returns:
653
+ Extracted frequency or None
654
+ """
655
+ text_lower = text.lower()
656
+
657
+ # Check all frequency patterns
658
+ for pattern in self.frequency_regex:
659
+ match = pattern.search(text_lower)
660
+ if match:
661
+ return match.group(0)
662
+
663
+ # Check for specific frequency words
664
+ frequency_words = ["daily", "weekly", "monthly", "quarterly", "annually", "yearly"]
665
+ for word in frequency_words:
666
+ if word in text_lower:
667
+ return word
668
+
669
+ return None
670
+
671
+ def _classify_business_domain(self, doc) -> List[Tuple[str, float]]:
672
+ """
673
+ Classify the business domain of the requirement.
674
+
675
+ Args:
676
+ doc: spaCy processed document
677
+
678
+ Returns:
679
+ List of (domain, confidence) tuples
680
+ """
681
+ text = doc.text.lower()
682
+ domain_scores = {}
683
+
684
+ # Calculate score for each domain based on keyword matches
685
+ for domain, keywords in self.process_taxonomy.items():
686
+ domain_score = 0
687
+ for keyword in keywords:
688
+ if keyword in text:
689
+ domain_score += 1
690
+
691
+ if domain_score > 0:
692
+ # Normalize by number of keywords
693
+ domain_scores[domain] = domain_score / len(keywords)
694
+
695
+ # If no direct matches, use semantic similarity
696
+ if not domain_scores:
697
+ for domain, keywords in self.process_taxonomy.items():
698
+ # Calculate average similarity between doc and each keyword
699
+ similarities = [doc.similarity(self.nlp(keyword)) for keyword in keywords]
700
+ avg_similarity = sum(similarities) / len(similarities) if similarities else 0
701
+
702
+ if avg_similarity > 0.5: # Threshold for relevance
703
+ domain_scores[domain] = avg_similarity
704
+
705
+ # Sort by score and return
706
+ sorted_domains = sorted(domain_scores.items(), key=lambda x: x[1], reverse=True)
707
+ return sorted_domains
708
+
709
+ def _assess_complexity(self, doc, text: str) -> str:
710
+ """
711
+ Assess the complexity of the requirement.
712
+
713
+ Args:
714
+ doc: spaCy processed document
715
+ text: Original text
716
+
717
+ Returns:
718
+ Complexity level ("high", "medium", or "low")
719
+ """
720
+ text_lower = text.lower()
721
+
722
+ # Count indicators for each complexity level
723
+ scores = {level: 0 for level in self.complexity_indicators.keys()}
724
+
725
+ for level, indicators in self.complexity_indicators.items():
726
+ for indicator in indicators:
727
+ if indicator in text_lower:
728
+ scores[level] += 1
729
+
730
+ # Check sentence structure complexity
731
+ sentence_count = len(list(doc.sents))
732
+ avg_tokens_per_sentence = len(doc) / sentence_count if sentence_count > 0 else 0
733
+
734
+ # Adjust scores based on structural complexity
735
+ if avg_tokens_per_sentence > 25:
736
+ scores["high"] += 1
737
+ elif avg_tokens_per_sentence > 15:
738
+ scores["medium"] += 1
739
+
740
+ # Check for conditional statements (if/then)
741
+ if "if" in text_lower and ("then" in text_lower or "else" in text_lower):
742
+ scores["high"] += 1
743
+
744
+ # Determine final complexity
745
+ if scores["high"] > 0:
746
+ return "high"
747
+ elif scores["medium"] > 0:
748
+ return "medium"
749
+ else:
750
+ return "low"
751
+
752
+ def _extract_data_elements(self, doc) -> List[str]:
753
+ """
754
+ Extract data elements from the requirement text.
755
+
756
+ Args:
757
+ doc: spaCy processed document
758
+
759
+ Returns:
760
+ List of data elements
761
+ """
762
+ # Find noun chunks that could be data elements
763
+ data_elements = []
764
+
765
+ for chunk in doc.noun_chunks:
766
+ # Check if this looks like a data field
767
+ if (any(token.pos_ == "NOUN" for token in chunk) and
768
+ len(chunk) <= 4 and # Not too long
769
+ not any(token.is_stop for token in chunk)): # Not all stopwords
770
+ data_elements.append(chunk.text)
771
+
772
+ # Look for specific data patterns
773
+ data_patterns = [
774
+ (r"\b[A-Z][a-z]+ ID\b", "ID field"),
775
+ (r"\b[A-Z][a-z]+ Number\b", "Number field"),
776
+ (r"\b[A-Z][a-z]+ Code\b", "Code field"),
777
+ (r"\b[A-Z][a-z]+ Date\b", "Date field"),
778
+ (r"\bstatus\b", "Status field")
779
+ ]
780
+
781
+ for pattern, field_type in data_patterns:
782
+ if re.search(pattern, doc.text):
783
+ data_elements.append(field_type)
784
+
785
+ return list(set(data_elements))
786
+
787
+ def analyze_requirements_batch(self, requirements: List[Dict]) -> List[Dict]:
788
+ """
789
+ Analyze a batch of requirements and find relationships between them.
790
+
791
+ Args:
792
+ requirements: List of requirement dictionaries with 'text' field
793
+
794
+ Returns:
795
+ List of analyzed requirements
796
+ """
797
+ # Process each requirement
798
+ processed_requirements = []
799
+ for req in requirements:
800
+ req_text = req.get('text', '')
801
+ source = req.get('source', 'batch')
802
+ processed = self.analyze_text_requirement(req_text, source)
803
+ processed_requirements.append(processed)
804
+
805
+ # Find relationships between requirements
806
+ self._find_requirement_relationships(processed_requirements)
807
+
808
+ return processed_requirements
809
+
810
+ def _find_requirement_relationships(self, requirements: List[Dict]) -> None:
811
+ """
812
+ Find and add relationships between requirements.
813
+
814
+ Args:
815
+ requirements: List of processed requirements
816
+ """
817
+ if len(requirements) < 2:
818
+ return
819
+
820
+ # Extract text from requirements
821
+ texts = [req["text"] for req in requirements]
822
+
823
+ # Create TF-IDF matrix
824
+ vectorizer = TfidfVectorizer(stop_words='english')
825
+ tfidf_matrix = vectorizer.fit_transform(texts)
826
+
827
+ # Calculate similarity matrix
828
+ similarity_matrix = cosine_similarity(tfidf_matrix)
829
+
830
+ # Add relationships to requirements
831
+ for i, req in enumerate(requirements):
832
+ related = []
833
+
834
+ for j