Electro-Sutra / text_extractor.py
RohanAi's picture
Upload 11 files
8feda73 verified
import docx
import csv
import io # Used for handling string as file-like object for IPC/D356 summaries
def extract_text_from_docx(filepath):
    """Return the paragraph text of a .docx file as a single string.

    The file is opened with ``docx.Document`` and the ``text`` of each entry
    in ``doc.paragraphs`` is collected in order. Only ``doc.paragraphs`` is
    read by this function.

    Args:
        filepath: Location of the document; passed straight to
            ``docx.Document``.

    Returns:
        The paragraph texts joined with newline characters (an empty string
        when the document has no paragraphs).
    """
    document = docx.Document(filepath)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
def extract_text_from_csv(filepath):
    """Return the contents of a CSV file as plain text, one line per row.

    Every row, the header row included, is rendered by joining its fields
    with ", "; the rendered rows are then joined with newline characters.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The formatted text, or an empty string when the file has no rows.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        csv.Error: If the csv module rejects the content.
    """
    # newline='' leaves line-ending handling to the csv module, which is what
    # it needs to parse quoted fields containing embedded newlines correctly.
    # 'utf-8-sig' reads plain UTF-8 as before but also drops a leading
    # byte-order mark so it does not end up glued to the first header field.
    with open(filepath, 'r', encoding='utf-8-sig', newline='') as f:
        # The header is formatted exactly like any other row, so it needs no
        # separate next() call; this also makes an empty file yield "" rather
        # than raising StopIteration.
        return "\n".join(", ".join(row) for row in csv.reader(f))
def summarize_ipc_or_d356_data(filepath, file_type):
    """Return a canned, placeholder summary for an IPC or D356 file.

    No file is opened or parsed: ``filepath`` is only interpolated into the
    returned text. A real implementation would need a dedicated parser.

    Args:
        filepath: Path of the file the summary is nominally about.
        file_type: ``'ipc'`` for IPC design data or ``'d356'`` for D356 test
            data (compared exactly, case-sensitive).

    Returns:
        A multi-line summary string for the two known types, otherwise a
        one-line "could not summarize" message naming ``filepath``.
    """
    if file_type == 'ipc':
        # Stand-in for real parsing that would identify ICs, connectors,
        # critical nets and so on.
        title = f"Summary of {filepath} (IPC design data):\n"
        bullets = (
            "- Identifies major components like U1 (MCU), U2 (Power IC), J1 (USB connector).\n",
            "- Provides connectivity information for power rails (VCC, GND) and data lines (e.g., I2C on U1 pins 3,4).\n",
            "- Indicates board dimensions and layer stackup.\n",
            "- Critical components: U1 (Microcontroller), U2 (Voltage Regulator), Q1 (MOSFET).",
        )
    elif file_type == 'd356':
        # Stand-in for real parsing of test points and nets to check.
        title = f"Summary of {filepath} (D356 test data):\n"
        bullets = (
            "- Lists test points: TP1 (VCC_3V3), TP2 (GND), TP3 (U1_I2C_SDA).\n",
            "- Indicates net connectivity for electrical tests.\n",
            "- Specifies areas for visual inspection related to component placement.",
        )
    else:
        return f"Could not summarize unknown file type: {filepath}"

    return title + "".join(bullets)
# --- Input locations (edit these to point at your own files) ---
# Reference material for the existing device.
manual_path = "inputFiles/Clemson_HW_Spec_V4_092325.docx"
sample_output_path = "Example Output Files/AE304196-001_LoRa Car Radio Bring-Up Procedure.docx"
ipc_design_path = "inputFiles/Assembly Testpoint Report for Car-PCB1.ipc"  # conceptual only: summarized, not parsed

# Data for the new device that test cases should be generated for.
new_device_d356_path = "Test Case Files/UNO-TH_Rev3e.d356"  # conceptual only: summarized, not parsed
new_device_bom_path = "Example Output Files/newModelBom.csv"


def _extract_or_report(extractor, path, label, failure_text):
    """Run ``extractor(path)`` and report the outcome on stdout.

    On success the character count is printed and the extracted text is
    returned. On any exception ``failure_text`` is printed together with the
    exception and ``failure_text`` itself is returned, so the caller always
    gets a string.
    """
    try:
        text = extractor(path)
        print(f"Extracted {len(text)} characters from {label}.")
        return text
    except Exception as err:
        print(failure_text, err)
        return failure_text


# --- 1. Extract text from the existing files ---
print("Extracting text from files...")

manual_text = _extract_or_report(
    extract_text_from_docx,
    manual_path,
    "manual",
    "ERROR: Could not read PCB Manual. Ensure file exists and is a valid .docx.",
)

sample_output_text = _extract_or_report(
    extract_text_from_docx,
    sample_output_path,
    "sample output",
    "ERROR: Could not read Sample Output. Ensure file exists and is a valid .docx.",
)

# IPC and D356 inputs are only summarized conceptually; a real pipeline would
# run external tools or parsers at this point.
ipc_summary = summarize_ipc_or_d356_data(ipc_design_path, 'ipc')
print(f"Generated IPC summary (conceptual): {len(ipc_summary)} characters.")
d356_summary = summarize_ipc_or_d356_data(new_device_d356_path, 'd356')
print(f"Generated D356 summary (conceptual): {len(d356_summary)} characters.")

new_device_bom_text = _extract_or_report(
    extract_text_from_csv,
    new_device_bom_path,
    "new device BOM",
    "ERROR: Could not read New Device BOM. Ensure file exists and is a valid .csv.",
)