import json
import yaml
import sympy
from sympy.parsing.latex import parse_latex
from huggingface_hub import hf_hub_download
from pathlib import Path
import jsonlines
from typing import Dict, List, Any
from config import DATASETS, DATA_PROCESSING
class MathDataProcessor:
    """Pipeline for downloading, normalizing, validating, and saving math datasets.

    Dataset locations and field mappings come from ``config.DATASETS``;
    validation thresholds come from ``config.DATA_PROCESSING``. All
    math-specific normalizers are best-effort: on a parse failure they
    return their input unchanged rather than raising.
    """

    def __init__(self):
        # Entries that passed validation, in processing order.
        self.processed_data: List[Dict[str, Any]] = []
        # Maps dataset name -> local directory it was downloaded to.
        self.dataset_paths: Dict[str, Path] = {}
        # Dispatch table: operation name -> handler method.
        self.math_operations = {
            "differentiation": self._process_differentiation,
            "integration": self._process_integration,
            "limits": self._process_limits,
            "simplification": self._process_simplification,
            "matrix": self._process_matrix,
            "probability": self._process_probability,
            "statistics": self._process_statistics,
        }

    def download_dataset(self, dataset_name: str) -> Path:
        """Download the configured split of *dataset_name* from the Hugging Face Hub.

        Returns the local directory the split file was placed in and records
        it in ``self.dataset_paths``.

        Raises:
            ValueError: if *dataset_name* is not declared in ``DATASETS``.
        """
        if dataset_name not in DATASETS:
            raise ValueError(f"Dataset {dataset_name} not defined in configuration")
        dataset_config = DATASETS[dataset_name]
        dataset_path = Path(f"data/{dataset_name}")
        # NOTE(review): hf_hub_download defaults to repo_type="model"; the
        # config key "dataset_name" suggests these are dataset repos, which
        # require repo_type="dataset" — confirm against the actual repo ids.
        hf_hub_download(
            repo_id=dataset_config["dataset_name"],
            filename=f"{dataset_config['split']}.jsonl",
            local_dir=dataset_path,
            repo_type="dataset",
        )
        self.dataset_paths[dataset_name] = dataset_path
        return dataset_path

    def normalize_equation(self, equation: str) -> str:
        """Return a canonical sympy string form of *equation*.

        Input containing a backslash is treated as LaTeX; anything else goes
        through ``sympy.sympify``. On any parse failure the input is returned
        unchanged (best-effort normalization).
        """
        try:
            parsed = parse_latex(equation) if "\\" in equation else sympy.sympify(equation)
            return str(parsed)
        except Exception:  # was a bare except: — don't swallow SystemExit etc.
            return equation

    def process_proof_steps(self, steps: List[str]) -> List[Dict[str, str]]:
        """Wrap each proof step in a dict, keeping YAML-structured steps as mappings.

        A step that parses to a YAML mapping is kept as that mapping; anything
        else (plain text, scalars, unparseable input) becomes ``{"step": step}``.
        """
        processed_steps: List[Dict[str, str]] = []
        for step in steps:
            try:
                structured = yaml.safe_load(step)
            except Exception:
                structured = None
            if isinstance(structured, dict):
                processed_steps.append(structured)
            else:
                processed_steps.append({"step": step})
        return processed_steps

    def _process_differentiation(self, expression: str) -> str:
        """Differentiate *expression* with respect to x; return input on failure."""
        x = sympy.Symbol('x')
        try:
            return str(sympy.diff(sympy.sympify(expression), x))
        except Exception:
            return expression

    def _process_integration(self, expression: str) -> str:
        """Integrate *expression* with respect to x; return input on failure."""
        x = sympy.Symbol('x')
        try:
            return str(sympy.integrate(sympy.sympify(expression), x))
        except Exception:
            return expression

    def _process_limits(self, expression: str) -> str:
        """Take the limit of *expression* as x -> oo; return input on failure."""
        x = sympy.Symbol('x')
        try:
            return str(sympy.limit(sympy.sympify(expression), x, sympy.oo))
        except Exception:
            return expression

    def _process_simplification(self, expression: str) -> str:
        """Simplify *expression* with sympy; return input on failure."""
        try:
            return str(sympy.simplify(sympy.sympify(expression)))
        except Exception:
            return expression

    def _process_matrix(self, matrix_str: str) -> str:
        """Parse a ';'-row, whitespace-column matrix string; return input on failure."""
        try:
            matrix = sympy.Matrix(
                [[float(n) for n in row.split()] for row in matrix_str.split(';')]
            )
            return str(matrix)
        except Exception:
            return matrix_str

    def _process_probability(self, problem: str) -> Dict:
        """Classify a probability problem and extract its parameters.

        Returns ``{"type": "unknown"}`` when the text doesn't mention
        probability or when extraction fails.
        """
        try:
            if "probability" in problem.lower():
                return {
                    "type": "probability",
                    "parameters": self._extract_parameters(problem),
                    "distribution": self._identify_distribution(problem),
                }
            return {"type": "unknown"}
        except Exception:
            return {"type": "unknown"}

    def _process_statistics(self, data: str) -> Dict:
        """Compute mean, median, and population std-dev of comma-separated numbers.

        Returns an ``{"error": ...}`` dict for input without commas or that
        fails to parse as floats.
        """
        try:
            if "," not in data:
                return {"error": "Invalid data format"}
            numbers = [float(n) for n in data.split(',')]
            ordered = sorted(numbers)
            mid = len(ordered) // 2
            # Bug fix: for even-length data the median is the mean of the two
            # middle values; the old code always took ordered[len//2].
            if len(ordered) % 2:
                median = ordered[mid]
            else:
                median = (ordered[mid - 1] + ordered[mid]) / 2
            return {
                "mean": sum(numbers) / len(numbers),
                "median": median,
                "std_dev": self._calculate_std_dev(numbers),
            }
        except Exception:
            return {"error": "Processing failed"}

    def _extract_parameters(self, text: str) -> Dict:
        """Extract a lhs/rhs pair from the first '=' in *text*, if any."""
        parameters: Dict[str, str] = {}
        if "=" in text:
            # Bug fix: split only on the first '=' so "a = b = c" keeps
            # the full right-hand side (old code kept only "b").
            lhs, rhs = text.split("=", 1)
            parameters["equation"] = lhs.strip()
            parameters["value"] = rhs.strip()
        return parameters

    def _identify_distribution(self, text: str) -> str:
        """Identify the probability distribution named in *text*, or "unknown"."""
        distributions = {
            "binomial": ["binomial", "bernoulli"],
            "normal": ["normal", "gaussian"],
            "poisson": ["poisson"],
            "exponential": ["exponential"],
        }
        text_lower = text.lower()
        for dist, keywords in distributions.items():
            if any(keyword in text_lower for keyword in keywords):
                return dist
        return "unknown"

    def _calculate_std_dev(self, numbers: List[float]) -> float:
        """Return the population standard deviation; assumes *numbers* is non-empty."""
        mean = sum(numbers) / len(numbers)
        variance = sum((x - mean) ** 2 for x in numbers) / len(numbers)
        return variance ** 0.5

    def process_math_operation(self, operation_type: str, content: str) -> Any:
        """Dispatch *content* to the handler for *operation_type*.

        Unknown operation types pass *content* through unchanged.
        """
        handler = self.math_operations.get(operation_type)
        return handler(content) if handler else content

    def validate_entry(self, entry: Dict[str, Any]) -> bool:
        """Validate an entry against size thresholds and mathematical checks.

        Checks, in order: minimum step count and text length (from
        ``DATA_PROCESSING``), equation parseability, step-to-step continuity,
        and basic proof structure. Any exception during the mathematical
        checks fails validation.
        """
        steps = entry.get("steps", [])
        text = entry.get("question", "") + entry.get("answer", "")
        # Size thresholds from configuration.
        if len(steps) < DATA_PROCESSING["validation"]["min_steps"]:
            return False
        if len(text) < DATA_PROCESSING["validation"]["min_length"]:
            return False
        try:
            # The equation must at least parse.
            if "equation" in entry:
                sympy.sympify(entry["equation"])
            # Each step's RHS should feed the next step's LHS.
            if len(steps) > 1:
                for first, second in zip(steps, steps[1:]):
                    if not self._check_step_continuity(first, second):
                        return False
            # Reject proofs with obvious structural problems.
            if "proof" in entry:
                if not self._check_proof_validity(entry["proof"]):
                    return False
            return True
        except Exception:
            return False

    def _check_step_continuity(self, step1: str, step2: str) -> bool:
        """Check that step1's right-hand side equals step2's left-hand side.

        Steps without '=' are considered trivially continuous; malformed
        steps (e.g. nothing after '=') fail the check.
        """
        try:
            if "=" in step1 and "=" in step2:
                rhs_of_first = step1.split("=")[1].strip()
                lhs_of_second = step2.split("=")[0].strip()
                return rhs_of_first == lhs_of_second
            return True
        except Exception:
            return False

    def _check_proof_validity(self, proof: str) -> bool:
        """Heuristic structural checks on a proof's text.

        An "assume" without a "therefore", or a "contradiction" without a
        "false", is treated as incomplete.
        """
        proof_lower = proof.lower()
        if "assume" in proof_lower and "therefore" not in proof_lower:
            return False
        if "contradiction" in proof_lower and "false" not in proof_lower:
            return False
        return True

    def process_dataset(self, dataset_name: str):
        """Download one dataset, process its configured fields, and keep valid entries."""
        dataset_path = self.download_dataset(dataset_name)
        dataset_config = DATASETS[dataset_name]
        with jsonlines.open(dataset_path / f"{dataset_config['split']}.jsonl") as reader:
            for entry in reader:
                processed_entry = {}
                for field in dataset_config["use_fields"]:
                    value = entry.get(field)
                    # Bug fix: `if value:` also skipped legitimate falsy
                    # values like "" or 0; only skip missing fields.
                    if value is None:
                        continue
                    if field == "equation":
                        processed_entry[field] = self.normalize_equation(value)
                    elif field == "proof_steps":
                        processed_entry[field] = self.process_proof_steps(value)
                    else:
                        processed_entry[field] = value
                if self.validate_entry(processed_entry):
                    self.processed_data.append(processed_entry)

    def save_processed_data(self, output_path: str):
        """Write all accumulated entries to *output_path* in JSONL format.

        Creates the parent directory if needed — jsonlines.open would
        otherwise fail when the output directory does not yet exist.
        """
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        with jsonlines.open(output_path, mode='w') as writer:
            writer.write_all(self.processed_data)
if __name__ == "__main__":
    processor = MathDataProcessor()
    # Run every configured dataset through the full pipeline.
    for name in DATASETS:
        processor.process_dataset(name)
    # Persist everything that survived validation.
    processor.save_processed_data("processed_data/math_expert_data.jsonl")