Spaces:

VcRlAgent
/

workwise-backend-gpu

Sleeping

File size: 3,318 Bytes

"""Data ingestion service for parsing Jira exports"""
import pandas as pd
import json
from typing import List, Dict, Any
from pathlib import Path
from app.utils.logger import setup_logger

logger = setup_logger(__name__)

class DataIngestionService:
    """Handles parsing and preprocessing of Jira data files"""
    
    @staticmethod
    def parse_csv(file_path: str) -> List[Dict[str, Any]]:
        """Parse Jira CSV export"""
        try:
            df = pd.read_csv(file_path)
            logger.info(f"Loaded {len(df)} records from {file_path}")
            
            # Normalize column names
            df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
            
            # Convert to list of dictionaries
            records = df.to_dict('records')
            
            # Clean and structure data
            processed_records = []
            for record in records:
                processed = DataIngestionService._clean_record(record)
                processed_records.append(processed)
            
            return processed_records
        
        except Exception as e:
            logger.error(f"Error parsing CSV: {str(e)}")
            raise
    
    @staticmethod
    def parse_json(file_path: str) -> List[Dict[str, Any]]:
        """Parse Jira JSON export"""
        try:
            with open(file_path, 'r') as f:
                data = json.load(f)
            
            if isinstance(data, dict) and 'issues' in data:
                records = data['issues']
            elif isinstance(data, list):
                records = data
            else:
                raise ValueError("Unexpected JSON structure")
            
            logger.info(f"Loaded {len(records)} records from {file_path}")
            return [DataIngestionService._clean_record(r) for r in records]
        
        except Exception as e:
            logger.error(f"Error parsing JSON: {str(e)}")
            raise
    
    @staticmethod
    def _clean_record(record: Dict[str, Any]) -> Dict[str, Any]:
        """Clean and normalize a single record"""
        # Handle missing values
        for key, value in record.items():
            if pd.isna(value) or value == '' or value == 'None':
                record[key] = None
        
        # Create searchable text representation
        #text_fields = ['summary', 'description', 'status', 'priority', 'project']
        text_fields = ['summary', 'description', 'status', 'priority', 'project','issue_type', 'component', 'module', 'symptom_severity','assignee', 'reporter']

        text_parts = []
        
        for field in text_fields:
            if field in record and record[field]:
                text_parts.append(f"{field}: {record[field]}")
        
        record['searchable_text'] = " | ".join(text_parts)
        
        return record
    
    @staticmethod
    def load_data(file_path: str) -> List[Dict[str, Any]]:
        """Load data from file (auto-detect format)"""
        file_ext = Path(file_path).suffix.lower()
        
        if file_ext == '.csv':
            return DataIngestionService.parse_csv(file_path)
        elif file_ext == '.json':
            return DataIngestionService.parse_json(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_ext}")