Spaces:

RohanAi
/

Electro-Sutra

Sleeping

App Files Files Community

Electro-Sutra / text_extractor.py

RohanAi

Upload 11 files

8feda73 verified 4 months ago

raw

history blame contribute delete

4.12 kB

	import docx
	import csv
	import io # Used for handling string as file-like object for IPC/D356 summaries

	def extract_text_from_docx(filepath):
		"""Return the paragraph text of a .docx file as one newline-joined string.

		Only the entries of ``Document.paragraphs`` are read.

		Args:
			filepath: Location of the .docx file, passed straight to
				``docx.Document``.

		Returns:
			The text of each paragraph, in document order, separated by "\\n".
		"""
		document = docx.Document(filepath)
		return "\n".join([paragraph.text for paragraph in document.paragraphs])

	def extract_text_from_csv(filepath):
		"""Return the contents of a CSV file as plain text, one row per line.

		The cells of each row are joined with ", " and the rows are joined with
		newlines. The header row is formatted exactly like every other row.

		Args:
			filepath: Path to a UTF-8 encoded CSV file. A leading byte-order
				mark, if present, is skipped.

		Returns:
			The formatted text, or an empty string when the file has no rows.
		"""
		# newline='' leaves line-ending handling to the csv module, so quoted
		# fields that contain line breaks are parsed correctly.
		# 'utf-8-sig' drops a leading BOM so it does not end up in the first cell.
		with open(filepath, 'r', encoding='utf-8-sig', newline='') as f:
			# Every row is handled uniformly instead of pulling the header out
			# with next(), so an empty file yields "" rather than StopIteration.
			return "\n".join(", ".join(row) for row in csv.reader(f))

	def summarize_ipc_or_d356_data(filepath, file_type):
		"""Return a canned, human-readable summary for an IPC or D356 file.

		This is a stand-in for a real parser: the file at ``filepath`` is never
		opened, and the path is only interpolated into the returned text.

		Args:
			filepath: Path shown in the first line of the summary.
			file_type: ``'ipc'`` for design data or ``'d356'`` for test data.

		Returns:
			A multi-line summary string, or a "could not summarize" message
			when ``file_type`` is neither ``'ipc'`` nor ``'d356'``.
		"""
		# Guard clause: anything but the two known types gets the fallback text.
		if file_type not in ('ipc', 'd356'):
			return f"Could not summarize unknown file type: {filepath}"

		if file_type == 'd356':
			# A real implementation would parse test points, nets to check, etc.
			return (
				f"Summary of {filepath} (D356 test data):\n"
				"- Lists test points: TP1 (VCC_3V3), TP2 (GND), TP3 (U1_I2C_SDA).\n"
				"- Indicates net connectivity for electrical tests.\n"
				"- Specifies areas for visual inspection related to component placement."
			)

		# Only 'ipc' remains. A real implementation would identify ICs,
		# connectors and critical nets; for now the summary is conceptual.
		return (
			f"Summary of {filepath} (IPC design data):\n"
			"- Identifies major components like U1 (MCU), U2 (Power IC), J1 (USB connector).\n"
			"- Provides connectivity information for power rails (VCC, GND) and data lines (e.g., I2C on U1 pins 3,4).\n"
			"- Indicates board dimensions and layer stackup.\n"
			"- Critical components: U1 (Microcontroller), U2 (Voltage Regulator), Q1 (MOSFET)."
		)


	# --- Input locations (edit these to match your own files) ---
	# Reference material describing the existing device.
	manual_path = "inputFiles/Clemson_HW_Spec_V4_092325.docx"
	sample_output_path = "Example Output Files/AE304196-001_LoRa Car Radio Bring-Up Procedure.docx"
	ipc_design_path = "inputFiles/Assembly Testpoint Report for Car-PCB1.ipc"  # conceptual; not parsed

	# Data for the new device that test cases should be generated for.
	new_device_d356_path = "Test Case Files/UNO-TH_Rev3e.d356"  # conceptual; not parsed
	new_device_bom_path = "Example Output Files/newModelBom.csv"

	# --- 1. Pull text out of the existing files ---
	# Each read is best-effort: on failure the variable holds an "ERROR: ..."
	# message instead of the file text, and the script keeps going.
	print("Extracting text from files...")

	# PCB manual (.docx)
	try:
		manual_text = extract_text_from_docx(manual_path)
		print(f"Extracted {len(manual_text)} characters from manual.")
	except Exception as err:
		manual_text = "ERROR: Could not read PCB Manual. Ensure file exists and is a valid .docx."
		print(manual_text, err)

	# Sample output procedure (.docx)
	try:
		sample_output_text = extract_text_from_docx(sample_output_path)
		print(f"Extracted {len(sample_output_text)} characters from sample output.")
	except Exception as err:
		sample_output_text = "ERROR: Could not read Sample Output. Ensure file exists and is a valid .docx."
		print(sample_output_text, err)

	# IPC and D356 summaries are placeholders produced without reading the
	# files; a real pipeline would run external tools or parsers here.
	ipc_summary = summarize_ipc_or_d356_data(ipc_design_path, 'ipc')
	print(f"Generated IPC summary (conceptual): {len(ipc_summary)} characters.")

	d356_summary = summarize_ipc_or_d356_data(new_device_d356_path, 'd356')
	print(f"Generated D356 summary (conceptual): {len(d356_summary)} characters.")

	# New device bill of materials (.csv)
	try:
		new_device_bom_text = extract_text_from_csv(new_device_bom_path)
		print(f"Extracted {len(new_device_bom_text)} characters from new device BOM.")
	except Exception as err:
		new_device_bom_text = "ERROR: Could not read New Device BOM. Ensure file exists and is a valid .csv."
		print(new_device_bom_text, err)