Spaces:
Sleeping
Sleeping
File size: 3,622 Bytes
f1b19d3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
"""
Text Extractor Tool - Clean, summarize, and process text
"""
import logging
from typing import Dict, Any
import sys
import os
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.helpers import clean_text, chunk_text, summarize_text, extract_keywords
logger = logging.getLogger(__name__)
def extract_text(text: str, operation: str = "clean", max_length: int = 500) -> Dict[str, Any]:
    """
    Run one text-processing operation on a piece of text.

    Args:
        text: Raw input text; empty or whitespace-only input is rejected
        operation: Which operation to run - 'clean', 'summarize', 'chunk', or 'keywords'
        max_length: Passed to summarize_text as the length limit and to
            chunk_text as the chunk size; unused by 'clean' and 'keywords'

    Returns:
        Dictionary with three keys: "result" (the processed text as a single
        string), "word_count" (number of whitespace-separated tokens in
        "result") and "metadata" (operation-specific details)

    Raises:
        ValueError: If the text is empty or the operation is not recognised.
        Any exception raised while processing is logged and then re-raised.
    """
    try:
        # Reject None, "" and whitespace-only input up front.
        if not (text and text.strip()):
            raise ValueError("Input text is empty")

        # Reject unsupported operations before any helper is invoked.
        if operation not in ("clean", "summarize", "chunk", "keywords"):
            raise ValueError(f"Unknown operation: {operation}. Use 'clean', 'summarize', 'chunk', or 'keywords'")

        if operation == "clean":
            processed = clean_text(text)
            details = {
                "operation": "clean",
                "original_length": len(text),
                "cleaned_length": len(processed),
            }
        elif operation == "summarize":
            processed = summarize_text(text, max_length)
            source_size = len(text)
            ratio = round(len(processed) / source_size, 2) if source_size > 0 else 0
            details = {
                "operation": "summarize",
                "original_length": source_size,
                "summary_length": len(processed),
                "compression_ratio": ratio,
            }
        elif operation == "chunk":
            pieces = chunk_text(text, chunk_size=max_length, overlap=50)
            # All chunks are returned as one string, split by a visible marker.
            processed = "\n\n---CHUNK---\n\n".join(pieces)
            details = {
                "operation": "chunk",
                "total_chunks": len(pieces),
                "chunk_size": max_length,
            }
        else:
            # Only 'keywords' can reach this branch after the guard above.
            found = extract_keywords(text, top_n=10)
            processed = ", ".join(found)
            details = {
                "operation": "keywords",
                "keyword_count": len(found),
                "keywords": found,
            }

        # The word count is taken from the final result string, not the input.
        return {
            "result": processed,
            "word_count": len(processed.split()),
            "metadata": details,
        }
    except Exception as err:
        logger.error(f"Error extracting text: {err}")
        raise
def process_multiple_texts(texts: list, operation: str = "clean", max_length: int = 500) -> list:
    """
    Process multiple texts with the same operation.

    A failure on one text does not abort the batch: the error is logged and
    an error entry is recorded for that index, then processing continues.

    Args:
        texts: List of text strings to process
        operation: Operation to apply to all texts
        max_length: Forwarded to extract_text for every text; the default
            matches extract_text's own default

    Returns:
        List with one dictionary per input text, in input order. Each entry
        has "index", "result", "word_count" and "metadata"; entries for texts
        that failed also have "error" (the exception message), with an empty
        result, a word count of 0 and empty metadata.
    """
    results = []
    for idx, text in enumerate(texts):
        try:
            result = extract_text(text, operation, max_length=max_length)
            result["index"] = idx
            results.append(result)
        except Exception as e:
            # Best-effort batch: record the failure and keep going.
            logger.error("Error processing text at index %s: %s", idx, e)
            results.append({
                "index": idx,
                "error": str(e),
                "result": "",
                "word_count": 0,
                # Same keys as a successful entry so callers can read
                # "metadata" uniformly.
                "metadata": {},
            })
    return results
|