berangerthomas committed on
Commit
9616cfb
·
1 Parent(s): a5c3b6f

parsing dates with dateutil

Browse files
Files changed (1) hide show
  1. utils/log2pandas.py +14 -41
utils/log2pandas.py CHANGED
@@ -1,15 +1,13 @@
1
- from datetime import datetime
2
-
3
  import pandas as pd
 
4
 
5
  from config.log_definitions import log_definitions
6
 
7
 
8
  class LogParser:
9
  """
10
- Une classe qui prend en argument le chemin d'un fichier log et une définition
11
- de log (par exemple issue de log_definitions), puis parse le fichier et renvoie
12
- un DataFrame pandas contenant les données extraites.
13
  """
14
 
15
  def __init__(self, file_path, log_type):
@@ -17,11 +15,11 @@ class LogParser:
17
  self.log_definition = log_definitions[log_type]
18
 
19
  def parse_line(self, line):
20
- """Parse une ligne du fichier log en utilisant la définition fournie."""
21
- # Commencer par remplacer les [ et ] par des espaces
22
  line = line.replace("[", " ").replace("]", " ")
23
  tokens = line.strip().split()
24
- # On ignore la ligne si elle ne contient pas assez de tokens
25
  if len(tokens) < len(self.log_definition["fields"]):
26
  return None
27
 
@@ -29,7 +27,7 @@ class LogParser:
29
  for field in self.log_definition["fields"]:
30
  pos = field["pos"]
31
 
32
- # Extraction de la valeur selon la position indiquée
33
  if isinstance(pos, slice):
34
  value = " ".join(tokens[pos])
35
  else:
@@ -38,40 +36,15 @@ class LogParser:
38
  except IndexError:
39
  value = None
40
 
41
- # Conversion du type
42
  if "type" in field:
43
  typ = field["type"]
44
  if typ == "datetime":
45
- formats = [
46
- "%a %b %d %H:%M:%S %Y", # Format typique
47
- "%Y-%m-%d %H:%M:%S", # ISO-like format
48
- "%d/%m/%Y %H:%M:%S", # European format
49
- "%m/%d/%Y %H:%M:%S", # US format
50
- "%Y%m%d%H%M%S", # Compact format
51
- "%Y-%m-%dT%H:%M:%S", # ISO format
52
- "%Y-%m-%dT%H:%M:%S.%f", # ISO with microseconds
53
- "%b %d %H:%M:%S", # Jun 14 15:16:01
54
- ]
55
-
56
- for date_format in formats:
57
- try:
58
- # Si l'année n'est pas présente dans le format,
59
- # on l'ajoute en utilisant l'année actuelle
60
- if "%Y" not in date_format:
61
- # Add current year to the date string
62
- current_year = datetime.now().year
63
- value_with_year = f"{value} {current_year}"
64
- # Add year to format string
65
- format_with_year = f"{date_format} %Y"
66
- value = datetime.strptime(
67
- value_with_year, format_with_year
68
- )
69
- else:
70
- value = datetime.strptime(value, date_format)
71
- break
72
- except ValueError:
73
- continue
74
- else: # No formats matched
75
  value = None
76
  elif typ == "direction":
77
  value = "download" if value == "o" else "upload"
@@ -86,7 +59,7 @@ class LogParser:
86
  return entry
87
 
88
  def parse_file(self):
89
- """Parcourt tout le fichier log et renvoie un DataFrame pandas contenant les entrées parse."""
90
  data = []
91
  with open(self.file_path, "r") as f:
92
  for line in f:
 
 
 
1
  import pandas as pd
2
+ from dateutil.parser import parse
3
 
4
  from config.log_definitions import log_definitions
5
 
6
 
7
  class LogParser:
8
  """
9
+ A class that takes a log file path and a log definition (for example from log_definitions),
10
+ then parses the file and returns a pandas DataFrame containing the extracted data.
 
11
  """
12
 
13
  def __init__(self, file_path, log_type):
 
15
  self.log_definition = log_definitions[log_type]
16
 
17
  def parse_line(self, line):
18
+ """Parse a line from the log file using the provided definition."""
19
+ # Start by replacing [ and ] with spaces
20
  line = line.replace("[", " ").replace("]", " ")
21
  tokens = line.strip().split()
22
+ # Ignore the line if it does not contain enough tokens
23
  if len(tokens) < len(self.log_definition["fields"]):
24
  return None
25
 
 
27
  for field in self.log_definition["fields"]:
28
  pos = field["pos"]
29
 
30
+ # Extract the value according to the indicated position
31
  if isinstance(pos, slice):
32
  value = " ".join(tokens[pos])
33
  else:
 
36
  except IndexError:
37
  value = None
38
 
39
+ # Type conversion
40
  if "type" in field:
41
  typ = field["type"]
42
  if typ == "datetime":
43
+ # Try to parse the date with dateutil.parser
44
+ try:
45
+ value = parse(value)
46
+ except ValueError:
47
+ # If the date is not parsable, try several formats
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  value = None
49
  elif typ == "direction":
50
  value = "download" if value == "o" else "upload"
 
59
  return entry
60
 
61
  def parse_file(self):
62
+ """Iterate through the entire log file and return a pandas DataFrame containing the parsed entries."""
63
  data = []
64
  with open(self.file_path, "r") as f:
65
  for line in f: