# Author: Varsha Dewangan
# Initial clean commit for project deployment (commit ee1d4aa)
import re
import pandas as pd
import os
# Define a list of your log file names.
# Paths are relative to the working directory this script is run from.
log_files = [
'training (2).txt',
'training_log_1_18.txt',
'training_log_17_27.txt',
'training_log_21_30.txt'
]
# Create an empty list to store parsed data.
# Each entry will be a dict with keys 'Epoch', 'Step', 'Loss', 'Perplexity'.
parsed_data = []
# Regex to capture Epoch, Step, Loss, and Perplexity.
# Matches lines of the form 'Epoch [X/Y], Step [A/B], Loss: V, Perplexity: W';
# groups 1-4 capture X, A, V, W (the totals Y and B are matched but not captured).
log_pattern = re.compile(
r"Epoch\s\[(\d+)/\d+\],\sStep\s\[(\d+)/\d+\],\sLoss:\s([\d.]+),\sPerplexity:\s([\d.]+)"
)
def parse_training_line(line, pattern):
    """Extract one metrics record from a single log line.

    Args:
        line: One line of log text.
        pattern: Compiled regex with four capturing groups
            (epoch, step, loss, perplexity), e.g. the module-level
            ``log_pattern``.

    Returns:
        A dict with keys 'Epoch', 'Step', 'Loss', 'Perplexity'
        (ints for the first two, floats for the last two), or
        ``None`` when the line does not match.
    """
    match = pattern.search(line)
    if match is None:
        return None
    # Group 1: Epoch, Group 2: Step, Group 3: Loss, Group 4: Perplexity.
    return {
        'Epoch': int(match.group(1)),
        'Step': int(match.group(2)),
        'Loss': float(match.group(3)),
        'Perplexity': float(match.group(4)),
    }


if __name__ == "__main__":
    print("Starting log parsing...")
    # Loop through each log file, skipping any that are missing on disk
    # (best-effort: a missing file is warned about, not fatal).
    for file_name in log_files:
        if not os.path.exists(file_name):
            print(f"Warning: File not found - {file_name}. Skipping.")
            continue
        print(f"Processing {file_name}...")
        # NOTE(review): logs are assumed utf-8 encoded — confirm if they differ.
        with open(file_name, 'r', encoding='utf-8') as f:
            for line in f:
                record = parse_training_line(line, log_pattern)
                if record is not None:
                    parsed_data.append(record)
# Expected column order for the metrics CSV.
METRIC_COLUMNS = ['Epoch', 'Step', 'Loss', 'Perplexity']


def build_metrics_frame(records):
    """Build a DataFrame of parsed metric records, sorted chronologically.

    Args:
        records: Iterable of dicts with keys matching ``METRIC_COLUMNS``.

    Returns:
        DataFrame sorted by Epoch then Step with a fresh 0..n-1 index.
        Passing explicit ``columns=`` keeps the frame well-formed (and
        sortable) even when *records* is empty — without it, an empty
        input would produce a column-less frame and ``sort_values``
        would raise ``KeyError``.
    """
    df = pd.DataFrame(records, columns=METRIC_COLUMNS)
    return df.sort_values(by=['Epoch', 'Step']).reset_index(drop=True)


if __name__ == "__main__":
    df_sorted = build_metrics_frame(parsed_data)
    # Save the DataFrame to a CSV file for downstream visualization.
    output_csv_file = 'training_metrics.csv'
    df_sorted.to_csv(output_csv_file, index=False)
    print(f"\nSuccessfully parsed logs and saved data to {output_csv_file}")
    print("You can now import this CSV file into Power BI to create your visualizations.")
    print("\nFirst few rows of the generated CSV:")
    print(df_sorted.head())