Spaces:

IBIBoW
/

FtsI_Classifier

Running

App Files Files Community

Muhamed-Kheir committed on Dec 17, 2025

Commit

121be4e

verified ·

1 Parent(s): 3d9381d

Upload 4 files

Browse files

Files changed (4) hide show

README.txt +18 -0
app.py +20 -0
kmer_unique.py +281 -0
requirements.txt +5 -0

README.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+# Multi-group unique k-mer analysis
+This tool compares multiple groups of FASTA sequences (one directory per group) and identifies **k-mers unique to each group** relative to all other groups. It outputs per-group TSV files, a summary Excel file, and two plots.
+## Install
+pip install -r requirements.txt
+## Run
+python kmer_unique.py \
+  --group-dirs path/to/groupA path/to/groupB path/to/groupC \
+  --k-min 15 --k-max 31 --min-freq 5 \
+  --outdir results
+## Outputs
+- `results/unique_k{k}_{group}.tsv` : unique k-mers and counts
+- `results/kmer_summary.xlsx` : summary table across k
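+- `results/unique_kmers_per_group.png` : number of unique k-mers per group, plotted against k
+- `results/total_freq_per_group.png` : total frequency of the unique k-mers per group, plotted against k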

app.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import gradio as gr
+def show_instructions():
+    return (
+        "This Space hosts a command-line k-mer analysis script.\n\n"
+        "Run it locally like this:\n"
+        "python kmer_unique.py --group-dirs path/to/groupA path/to/groupB "
+        "--k-min 1 --k-max 50 --min-freq 5 --outdir kmer_results\n"
+    )
+demo = gr.Interface(
+    fn=show_instructions,
+    inputs=[],
+    outputs="text",
+    title="Unique k-mer Analysis (CLI tool)",
+    description="This Space is a Gradio wrapper so the repository builds. The main tool is kmer_unique.py."
+)
+if __name__ == "__main__":
+    demo.launch()

kmer_unique.py ADDED Viewed

	@@ -0,0 +1,281 @@

+#!/usr/bin/env python3
+"""
+Multi-group unique k-mer analysis.
+Given multiple directories (each directory = one group) containing FASTA files,
+computes k-mers unique to each group (relative to all other groups), filters by
+minimum frequency, writes per-group outputs, and saves a summary spreadsheet + plots.
+Example:
+  python kmer_unique.py \
+    --group-dirs data/groupA data/groupB data/groupC \
+    --k-min 15 --k-max 31 --min-freq 5 \
+    --outdir results
+"""
+from __future__ import annotations
+import argparse
+import os
+from collections import Counter
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Sequence, Tuple
+import matplotlib.pyplot as plt
+import pandas as pd
+from Bio import SeqIO
+# -------------------------------
+# Data structures
+# -------------------------------
+@dataclass(frozen=True)
+class GroupData:
+    """Immutable description of one input group (one directory of FASTA files)."""
+    # Group label; run() derives it from the directory's base name and uses it
+    # in output file names and summary column headers.
+    name: str
+    # Directory path exactly as supplied on the command line.
+    dirpath: str
+    # Paths of the FASTA files found in the directory, stored as a tuple.
+    fasta_files: Tuple[str, ...]
+# -------------------------------
+# K-mer utilities
+# -------------------------------
+def get_kmers(sequence: str, k: int) -> List[str]:
+    """Extract all k-mers from a sequence, excluding those with 'N'."""
+    s = str(sequence).upper()
+    if k <= 0 or len(s) < k:
+        return []
+    return [s[i:i + k] for i in range(len(s) - k + 1) if "N" not in s[i:i + k]]
+def kmers_in_records(records: Iterable, k: int) -> set:
+    """Return the set of all k-mers present in a list of SeqRecords."""
+    out = set()
+    for rec in records:
+        out.update(get_kmers(str(rec.seq), k))
+    return out
+def count_kmers_in_records(records: Iterable, k: int) -> Counter:
+    """Count k-mers across all sequences in a list of SeqRecords."""
+    kmers: List[str] = []
+    for rec in records:
+        kmers.extend(get_kmers(str(rec.seq), k))
+    return Counter(kmers)
+# -------------------------------
+# IO helpers
+# -------------------------------
+FASTA_EXTS = (".fas", ".fasta", ".fa", ".fna")
+def find_fasta_files(directory: str) -> List[str]:
+    """Find all FASTA files in a directory."""
+    files = []
+    for fn in os.listdir(directory):
+        path = os.path.join(directory, fn)
+        if os.path.isfile(path) and fn.lower().endswith(FASTA_EXTS):
+            files.append(path)
+    files.sort()
+    return files
+def parse_fasta_files(files: Sequence[str]) -> List:
+    """Parse FASTA files into BioPython SeqRecord objects."""
+    records = []
+    for fp in files:
+        try:
+            records.extend(list(SeqIO.parse(fp, "fasta")))
+        except Exception as e:
+            print(f"⚠️  Warning: could not read {fp}: {e}")
+    return records
+def write_kmer_output(filepath: str, kmer_dict: Dict[str, int], total_freq: int) -> None:
+    """Write k-mers and their counts to a TSV file."""
+    with open(filepath, "w", encoding="utf-8") as f:
+        f.write(f"# Total frequency of unique k-mers: {total_freq}\n")
+        f.write("kmer\tcount\n")
+        for kmer, count in sorted(kmer_dict.items()):
+            f.write(f"{kmer}\t{count}\n")
+# -------------------------------
+# Plotting
+# -------------------------------
+def plot_kmer_summary(summary_df: pd.DataFrame, output_dir: str) -> None:
+    """Generate plots showing unique k-mers and total frequencies per group."""
+    groups = [c.replace(" unique k-mers", "") for c in summary_df.columns if c.endswith(" unique k-mers")]
+    # Plot 1: Number of unique k-mers
+    plt.figure(figsize=(10, 6))
+    for g in groups:
+        plt.plot(summary_df["k"], summary_df[f"{g} unique k-mers"], marker="o", label=g)
+    plt.xlabel("k-mer size (k)")
+    plt.ylabel("Number of unique k-mers")
+    plt.title("Unique k-mers per group across k sizes")
+    plt.legend()
+    plt.grid(True)
+    plt.tight_layout()
+    plt.savefig(os.path.join(output_dir, "unique_kmers_per_group.png"), dpi=300)
+    plt.close()
+    # Plot 2: Total frequency of unique k-mers
+    plt.figure(figsize=(10, 6))
+    for g in groups:
+        plt.plot(summary_df["k"], summary_df[f"{g} total frequency"], marker="s", label=g)
+    plt.xlabel("k-mer size (k)")
+    plt.ylabel("Total frequency of unique k-mers")
+    plt.title("Total frequency of unique k-mers per group across k sizes")
+    plt.legend()
+    plt.grid(True)
+    plt.tight_layout()
+    plt.savefig(os.path.join(output_dir, "total_freq_per_group.png"), dpi=300)
+    plt.close()
+    print(f"✅ Plots saved to: {output_dir}")
+# -------------------------------
+# Core logic
+# -------------------------------
+def compute_unique_kmers_per_group(
+    group_records: Dict[str, List],
+    k: int,
+    min_freq: int,
+) -> Tuple[Dict[str, Dict[str, int]], Dict[str, int]]:
+    """
+    For each group, compute (unique kmers -> counts) after filtering by min_freq,
+    and total frequency of those unique kmers.
+    """
+    group_kmer_sets = {g: kmers_in_records(recs, k) for g, recs in group_records.items()}
+    group_counts = {g: count_kmers_in_records(recs, k) for g, recs in group_records.items()}
+    unique_counts: Dict[str, Dict[str, int]] = {}
+    total_freqs: Dict[str, int] = {}
+    group_names = list(group_records.keys())
+    for g in group_names:
+        unique = set(group_kmer_sets[g])
+        for other in group_names:
+            if other != g:
+                unique -= group_kmer_sets[other]
+        freq_unique = {km: group_counts[g][km] for km in unique if group_counts[g][km] >= min_freq}
+        unique_counts[g] = freq_unique
+        total_freqs[g] = sum(freq_unique.values())
+    return unique_counts, total_freqs
+def run(
+    group_dirs: Sequence[str],
+    k_min: int,
+    k_max: int,
+    min_freq: int,
+    output_dir: str,
+) -> None:
+    os.makedirs(output_dir, exist_ok=True)
+    # Validate and collect groups
+    groups: List[GroupData] = []
+    for d in group_dirs:
+        if not os.path.exists(d):
+            raise FileNotFoundError(f"Group directory not found: {d}")
+        if not os.path.isdir(d):
+            raise NotADirectoryError(f"Not a directory: {d}")
+        name = os.path.basename(d.rstrip(os.sep))
+        fasta_files = tuple(find_fasta_files(d))
+        if not fasta_files:
+            raise FileNotFoundError(f"No FASTA files found in: {d}")
+        groups.append(GroupData(name=name, dirpath=d, fasta_files=fasta_files))
+    print("📂 Groups and file counts:")
+    for g in groups:
+        print(f"  {g.name}: {len(g.fasta_files)} files")
+    summary_rows: List[dict] = []
+    for k in range(k_min, k_max + 1):
+        print(f"\n🔹 Processing k = {k}")
+        # Parse records once per k (same as your original logic)
+        group_records = {g.name: parse_fasta_files(g.fasta_files) for g in groups}
+        unique_counts, total_freqs = compute_unique_kmers_per_group(
+            group_records=group_records,
+            k=k,
+            min_freq=min_freq,
+        )
+        row = {"k": k}
+        for g in groups:
+            counts = unique_counts[g.name]
+            total_freq = total_freqs[g.name]
+            print(f"  Group {g.name}: {len(counts)} unique {k}-mers | Total freq: {total_freq}")
+            out_path = os.path.join(output_dir, f"unique_k{k}_{g.name}.tsv")
+            write_kmer_output(out_path, counts, total_freq)
+            row[f"{g.name} unique k-mers"] = len(counts)
+            row[f"{g.name} total frequency"] = total_freq
+        summary_rows.append(row)
+    summary_df = pd.DataFrame(summary_rows)
+    summary_path = os.path.join(output_dir, "kmer_summary.xlsx")
+    summary_df.to_excel(summary_path, index=False)
+    print(f"\n✅ Summary saved to: {summary_path}")
+    plot_kmer_summary(summary_df, output_dir)
+# -------------------------------
+# CLI
+# -------------------------------
+def build_arg_parser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(
+        description="Compute group-specific unique k-mers from FASTA directories.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    p.add_argument(
+        "--group-dirs",
+        nargs="+",
+        required=True,
+        help="List of directories, one per group, containing FASTA files.",
+    )
+    p.add_argument("--k-min", type=int, default=1, help="Minimum k-mer size.")
+    p.add_argument("--k-max", type=int, default=50, help="Maximum k-mer size.")
+    p.add_argument("--min-freq", type=int, default=5, help="Minimum frequency threshold for unique k-mers.")
+    p.add_argument("--outdir", type=str, default="kmer_results", help="Output directory.")
+    return p
+def main() -> None:
+    args = build_arg_parser().parse_args()
+    if args.k_min < 1:
+        raise ValueError("--k-min must be >= 1")
+    if args.k_max < args.k_min:
+        raise ValueError("--k-max must be >= --k-min")
+    if args.min_freq < 1:
+        raise ValueError("--min-freq must be >= 1")
+    run(
+        group_dirs=args.group_dirs,
+        k_min=args.k_min,
+        k_max=args.k_max,
+        min_freq=args.min_freq,
+        output_dir=args.outdir,
+    )
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio
+biopython
+pandas
+matplotlib
+openpyxl