DPACMAN / dpacman /data_tasks /visualize /visualizations.py
svincoff's picture
hydra restructure
a887ffc
Raw
History Blame Contribute Delete
3.56 kB
import pandas as pd
import random
import matplotlib.pyplot as plt
import glob
import re
from pathlib import Path
def trim_sequence(seq: str, seq_flanked: str, total_len: int):
    """
    Return a substring of seq_flanked of length total_len that contains seq
    at a random valid position. Also returns (upstream, downstream).

    The first occurrence of ``seq`` inside ``seq_flanked`` is used as the
    anchor. The window is then extended by ``upstream`` characters to the
    left and ``downstream`` characters to the right of that occurrence, where
    ``upstream + downstream == total_len - len(seq)`` and ``upstream`` is
    drawn uniformly from every value that keeps the window inside
    ``seq_flanked``.

    Args:
        seq: Motif that must be contained in the returned window.
        seq_flanked: Longer sequence containing ``seq`` plus flanking context.
        total_len: Desired length of the returned window.

    Returns:
        A ``(window, upstream, downstream)`` tuple; ``upstream`` is also the
        0-based start index of the motif inside ``window``.

    Raises:
        ValueError: If ``seq`` does not occur in ``seq_flanked``, if
            ``total_len`` is shorter than ``seq``, or if the available flank
            is too short to reach ``total_len``.
    """
    motif_start = seq_flanked.find(seq)
    if motif_start < 0:
        raise ValueError(f"Motif '{seq}' not found in flanked sequence.")

    motif_len = len(seq)
    extra = total_len - motif_len
    if extra < 0:
        # Without this guard the call falls through to random.randint with an
        # empty range and fails with an unrelated-looking error message.
        raise ValueError(
            f"Desired length {total_len} is shorter than the motif "
            f"({motif_len})."
        )

    left_avail = motif_start
    right_avail = len(seq_flanked) - (motif_start + motif_len)
    if extra > left_avail + right_avail:
        raise ValueError("Not enough flank to reach desired length.")

    # The left extension must be large enough that the remainder fits on the
    # right, and small enough that it fits on the left.
    min_left = max(0, extra - right_avail)
    max_left = min(extra, left_avail)
    upstream = random.randint(min_left, max_left)
    downstream = extra - upstream

    start = motif_start - upstream
    end = motif_start + motif_len + downstream
    return seq_flanked[start:end], upstream, downstream
def process_and_plot(input_csv: str, total_len: int, output_csv: Path, fig_dir: Path):
    """
    Trim every row of a CSV to a fixed-length window and plot motif positions.

    Reads ``input_csv`` (which must have ``seq`` and ``seq_flanked`` columns),
    calls ``trim_sequence`` on each row, and writes a copy of the table to
    ``output_csv`` with three added columns:

    * ``seq_trimmed``     - the trimmed window,
    * ``motif_abs_start`` - 0-based start of the motif inside the window,
    * ``motif_rel_pos``   - that start divided by the number of flank
      positions in the window (``total_len - len(seq)``); 0.0 when the motif
      already fills the whole window.

    Two histograms are saved into ``fig_dir`` as ``<stem>_abs.png`` and
    ``<stem>_rel.png``, where ``<stem>`` is the input file name without its
    extension.

    Args:
        input_csv: Path to the input CSV (``str`` or ``Path``).
        total_len: Desired length of each trimmed window.
        output_csv: Destination of the augmented CSV.
        fig_dir: Existing directory that receives the PNG figures.

    Raises:
        ValueError: Propagated from ``trim_sequence`` when a row cannot be
            trimmed to ``total_len``.
    """
    # Accept plain strings as the annotation promises; ``.stem`` and ``/``
    # below need real Path objects.
    input_csv = Path(input_csv)
    fig_dir = Path(fig_dir)

    df = pd.read_csv(input_csv)

    trimmed_seqs, abs_pos, rel_pos = [], [], []
    for seq, seq_flanked in zip(df["seq"], df["seq_flanked"]):
        trimmed, upstream, _ = trim_sequence(seq, seq_flanked, total_len)
        flank_len = total_len - len(seq)
        trimmed_seqs.append(trimmed)
        abs_pos.append(upstream)
        # With no flank at all the motif can only sit at position 0; avoid
        # dividing by zero and record that as relative position 0.0.
        rel_pos.append(upstream / flank_len if flank_len else 0.0)

    df_out = df.copy()
    df_out["seq_trimmed"] = trimmed_seqs
    df_out["motif_abs_start"] = abs_pos
    df_out["motif_rel_pos"] = rel_pos
    df_out.to_csv(output_csv, index=False)

    basename = input_csv.stem
    # (column, title suffix, x-axis label, file suffix) for each histogram.
    plots = (
        ("motif_abs_start", "Absolute Motif Start", "Start Index (nt)", "abs"),
        ("motif_rel_pos", "Relative Motif Position", "Relative Position", "rel"),
    )
    for column, title, xlabel, suffix in plots:
        plt.figure(figsize=(6, 4))
        plt.hist(df_out[column], bins=50, edgecolor="k")
        plt.title(f"{basename}: {title}")
        plt.xlabel(xlabel)
        plt.ylabel("Count")
        plt.tight_layout()
        plt.savefig(fig_dir / f"{basename}_{suffix}.png")
        # Close each figure so memory does not grow across many input files.
        plt.close()
if __name__ == "__main__":
    # === USER SETTINGS ===
    PATTERN = "/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/tfclust/hg38/encRegTfbsClustered_hg38_chr*.csv"
    # The glob above also matches names with extra text after the chromosome
    # label; this regex keeps only files named exactly chr1-22, chrX or chrY.
    CHR_FILTER = re.compile(
        r"encRegTfbsClustered_hg38_chr([1-9]|1[0-9]|2[0-2]|X|Y)\.csv$"
    )
    DESIRED_LEN = 1000
    OUTPUT_DIR = Path("trimmed_csvs")
    FIG_DIR = Path("figures")
    # =====================

    OUTPUT_DIR.mkdir(exist_ok=True)
    FIG_DIR.mkdir(exist_ok=True)

    # Clear old figures.
    # NOTE: this removes every regular file directly inside FIG_DIR, not only
    # PNGs written by a previous run of this script.
    for f in FIG_DIR.iterdir():
        if f.is_file():
            f.unlink()

    # Gather files and filter to pure chr1-22, X, Y
    all_files = glob.glob(PATTERN)
    files = [Path(f) for f in all_files if CHR_FILTER.match(Path(f).name)]
    if not files:
        print(f"No matching chr1-22, X, Y files found (pattern={PATTERN}).")
        # ``exit`` is a helper injected by the ``site`` module for interactive
        # use and may be missing (e.g. ``python -S``); SystemExit always works.
        raise SystemExit(1)

    for infile in sorted(files):
        out_csv = OUTPUT_DIR / f"{infile.stem}_trimmed.csv"
        try:
            process_and_plot(infile, DESIRED_LEN, out_csv, FIG_DIR)
            print(f"Processed {infile.name} -> {out_csv.name}; figures in {FIG_DIR}/")
        except Exception as e:
            # Best-effort batch: report the failure and continue with the
            # remaining chromosome files.
            print(f"Error processing {infile.name}: {e}")