Spaces:
Running
Running
File size: 2,748 Bytes
39028c9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 | import os
import json
import logging
from typing import List, Dict, Any
from pathlib import Path
# Configure logging.
# NOTE: this runs when the module is imported. basicConfig() sets up the root
# logger at INFO level with "time - logger name - level - message" records.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger, named after this module, used by the functions below.
logger = logging.getLogger(__name__)
def load_config(config_path: str) -> Dict[str, Any]:
    """
    Load configuration from a JSON file.

    Args:
        config_path: Path to the configuration file.

    Returns:
        The parsed JSON content, or an empty dict when the file does not
        exist (the miss is logged as an error instead of being raised).

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    try:
        # JSON is UTF-8 by specification; name the encoding explicitly so the
        # result does not depend on the platform's locale encoding.
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except FileNotFoundError:
        logger.error(f"Configuration file not found: {config_path}")
        return {}
    logger.info(f"Configuration loaded from {config_path}")
    return config
def save_config(config: Dict[str, Any], config_path: str) -> None:
    """
    Write a configuration dictionary to disk as JSON.

    Missing parent directories of the destination are created first, and
    the JSON output is indented by four spaces.

    Args:
        config: Configuration dictionary to serialize
        config_path: Destination path of the JSON file
    """
    parent_dir = Path(config_path).parent
    parent_dir.mkdir(parents=True, exist_ok=True)

    with open(config_path, mode='w') as out_file:
        json.dump(config, out_file, indent=4)

    logger.info(f"Configuration saved to {config_path}")
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """
    Split text into overlapping chunks.

    Chunks start every ``chunk_size - overlap`` characters, so each chunk
    after the first begins with the last ``overlap`` characters of the
    previous one. The final chunk may be shorter than ``chunk_size``.

    Args:
        text: Input text to chunk
        chunk_size: Maximum number of characters per chunk (must be > 0)
        overlap: Number of characters shared by consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``)

    Returns:
        List of text chunks; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # A step of zero makes range() raise an opaque error and a negative step
    # silently yields no chunks at all, so reject both up front.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    # Every start index is < len(text), so each slice is non-empty.
    chunks = [text[start:start + chunk_size] for start in range(0, len(text), step)]
    logger.info(f"Text split into {len(chunks)} chunks")
    return chunks
def merge_chunks(chunks: List[str], overlap: int = 50) -> str:
    """
    Merge overlapping text chunks back into a single text.

    The first chunk is kept whole; from every later chunk the leading
    ``overlap`` characters are dropped before it is appended. A later chunk
    no longer than ``overlap`` therefore contributes nothing.

    Args:
        chunks: List of text chunks, in order
        overlap: Overlap size that was used when the chunks were produced

    Returns:
        Merged text, or an empty string when ``chunks`` is empty.
    """
    if not chunks:
        return ""
    # Build the result with a single join instead of repeated "+=", which
    # can degrade to quadratic time on long chunk lists.
    return chunks[0] + "".join(chunk[overlap:] for chunk in chunks[1:])
def get_file_size(file_path: str) -> int:
    """Return the size, in bytes, of the file at ``file_path``."""
    file_stat = os.stat(file_path)
    return file_stat.st_size
def count_tokens_approximate(text: str, chars_per_token: int = 4) -> int:
    """
    Approximate token count using a character-length heuristic.

    The estimate is ``len(text) // chars_per_token``; no tokenization is
    performed. For more accurate counting, use tokenizer from transformers
    library.

    Args:
        text: Input text
        chars_per_token: Assumed average number of characters per token
            (default 4, the rough estimate used so far)

    Returns:
        Approximate token count (rounded down)

    Raises:
        ValueError: If ``chars_per_token`` is not positive.
    """
    if chars_per_token <= 0:
        raise ValueError("chars_per_token must be a positive integer")
    # Rough estimate: 1 token ≈ chars_per_token characters
    return len(text) // chars_per_token
|