File size: 3,622 Bytes
f1b19d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""

Text Extractor Tool - Clean, summarize, and process text

"""
import logging
from typing import Dict, Any
import sys
import os

# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils.helpers import clean_text, chunk_text, summarize_text, extract_keywords

logger = logging.getLogger(__name__)


def extract_text(text: str, operation: str = "clean", max_length: int = 500) -> Dict[str, Any]:
    """
    Process text based on the specified operation.

    Args:
        text: Raw text to process. Must contain at least one
            non-whitespace character.
        operation: Operation to perform - 'clean', 'summarize', 'chunk',
            or 'keywords'.
        max_length: Passed to ``summarize_text`` for 'summarize' and used
            as the chunk size for 'chunk'. Not used by 'clean' or
            'keywords'.

    Returns:
        Dictionary with three keys:
            result: The processed text as one string. Chunks are joined
                with a ``---CHUNK---`` separator line; keywords are
                joined with ", ".
            word_count: Number of whitespace-separated words in the
                processed text. For 'chunk' the separator markers are
                not counted.
            metadata: Operation-specific details (always includes the
                "operation" name).

    Raises:
        ValueError: If ``text`` is empty or whitespace-only, if
            ``operation`` is not one of the supported names, or if
            ``max_length`` is not positive for the 'chunk' operation.
    """
    try:
        if not text or not text.strip():
            raise ValueError("Input text is empty")

        # Set only by branches that must not count words on the joined
        # result string; None means "count words in result" below.
        word_count = None

        if operation == "clean":
            result = clean_text(text)
            metadata = {
                "operation": "clean",
                "original_length": len(text),
                "cleaned_length": len(result),
            }

        elif operation == "summarize":
            result = summarize_text(text, max_length)
            metadata = {
                "operation": "summarize",
                "original_length": len(text),
                "summary_length": len(result),
                # text is guaranteed non-empty here, so the division is safe.
                "compression_ratio": round(len(result) / len(text), 2),
            }

        elif operation == "chunk":
            if max_length <= 0:
                raise ValueError(
                    f"max_length must be a positive integer for 'chunk', got {max_length}"
                )
            # Keep the overlap strictly smaller than the chunk size; for
            # max_length > 50 this is the same overlap of 50 as before.
            # NOTE(review): presumably chunk_text cannot make progress when
            # overlap >= chunk_size - confirm against utils.helpers.
            overlap = min(50, max_length - 1)
            chunks = chunk_text(text, chunk_size=max_length, overlap=overlap)
            result = "\n\n---CHUNK---\n\n".join(chunks)
            # Count words per chunk so the "---CHUNK---" markers inserted
            # above do not inflate the total.
            word_count = sum(len(chunk.split()) for chunk in chunks)
            metadata = {
                "operation": "chunk",
                "total_chunks": len(chunks),
                "chunk_size": max_length,
                "overlap": overlap,
            }

        elif operation == "keywords":
            keywords = extract_keywords(text, top_n=10)
            result = ", ".join(keywords)
            metadata = {
                "operation": "keywords",
                "keyword_count": len(keywords),
                "keywords": keywords,
            }

        else:
            raise ValueError(f"Unknown operation: {operation}. Use 'clean', 'summarize', 'chunk', or 'keywords'")

        if word_count is None:
            word_count = len(result.split())

        return {
            "result": result,
            "word_count": word_count,
            "metadata": metadata,
        }

    except Exception as e:
        # Log at this boundary, then let the caller decide how to handle it.
        logger.error("Error extracting text: %s", e)
        raise


def process_multiple_texts(texts: list, operation: str = "clean", max_length: int = 500) -> list:
    """
    Process multiple texts with the same operation.

    A failure on one text does not abort the batch: the error is logged
    and recorded in that text's entry, and processing continues.

    Args:
        texts: List of text strings to process.
        operation: Operation to apply to all texts; forwarded unchanged
            to ``extract_text``.
        max_length: Forwarded to ``extract_text`` for every text.
            Defaults to 500, the same default ``extract_text`` uses.

    Returns:
        List with one dictionary per input text, in input order. Each
        entry has "index", "result", "word_count" and "metadata" keys;
        entries for texts that failed additionally carry an "error"
        message, an empty "result" and a "word_count" of 0.
    """
    results = []
    for idx, text in enumerate(texts):
        try:
            entry = extract_text(text, operation, max_length)
        except Exception as e:
            # Deliberate best-effort: record the failure and keep going.
            logger.error("Error processing text at index %s: %s", idx, e)
            entry = {
                "index": idx,
                "error": str(e),
                "result": "",
                "word_count": 0,
                # Same top-level keys as a successful entry, so callers can
                # read entry["metadata"] without checking for errors first.
                "metadata": {"operation": operation},
            }
        else:
            entry["index"] = idx
        results.append(entry)

    return results