2025-12-10 15:47:00 +02:00
5 changed files with 1534 additions and 18 deletions
--- a/config/app_config.yaml
+++ b/config/app_config.yaml
@@ -18,6 +18,8 @@ training:
  default_imgsz: 640
  default_patience: 50
  default_lr0: 0.01
+  last_dataset_yaml: /home/martin/code/object_detection/data/datasets/data.yaml
+  last_dataset_dir: /home/martin/code/object_detection/data/datasets
 detection:
  default_confidence: 0.25
  default_iou: 0.45
--- a/src/database/db_manager.py
+++ b/src/database/db_manager.py
@@ -10,6 +10,13 @@ from typing import List, Dict, Optional, Tuple, Any, Union
 from pathlib import Path
 import csv
 import hashlib
+import yaml
+
+from src.utils.logger import get_logger
+
+IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp")  # compared against lowercased file suffixes
+
+logger = get_logger(__name__)  # module-level logger for this file


 class DatabaseManager:
@@ -861,6 +868,187 @@ class DatabaseManager:
        finally:
            conn.close()

+    # ==================== Dataset Utilities ====================
+
+    def compose_data_yaml(
+        self,
+        dataset_root: str,
+        output_path: Optional[str] = None,
+        splits: Optional[Dict[str, str]] = None,
+    ) -> str:
+        """
+        Write a YOLO ``data.yaml`` for the dataset stored under ``dataset_root``.
+
+        Args:
+            dataset_root: Directory that holds the dataset; it must already exist.
+            output_path: Destination file; ``<dataset_root>/data.yaml`` when omitted.
+            splits: Optional ``train``/``val``/``test`` image directories, absolute or
+                relative to ``dataset_root``; missing ones are inferred, other keys ignored.
+
+        Returns:
+            POSIX-style path of the YAML file that was written.
+
+        Raises:
+            ValueError: If the root does not exist, ``train`` or ``val`` cannot be
+                determined, a split directory is rejected, or no class names exist.
+        """
+        root = Path(dataset_root).expanduser()
+        if not root.exists():
+            raise ValueError(f"Dataset root does not exist: {root}")
+        root = root.resolve()
+
+        # Explicit overrides win; anything left blank falls back to inference.
+        overrides = splits or {}
+        guessed = self._infer_split_dirs(root)
+        chosen: Dict[str, str] = {}
+        for split in ("train", "val", "test"):
+            chosen[split] = overrides.get(split) or guessed.get(split, "")
+
+        for required in ("train", "val"):
+            if chosen[required]:
+                continue
+            raise ValueError(
+                "Unable to determine %s image directory under %s. Provide it "
+                "explicitly via the 'splits' argument."
+                % (required, root)
+            )
+
+        yaml_splits: Dict[str, str] = {
+            split: self._normalize_split_value(location, root)
+            for split, location in chosen.items()
+            if location
+        }
+
+        class_names = (
+            self._fetch_annotation_class_names()
+            or [cls["class_name"] for cls in self.get_object_classes()]
+        )
+        if not class_names:
+            raise ValueError("No object classes available to populate data.yaml")
+
+        payload: Dict[str, Any] = {
+            "path": root.as_posix(),
+            "train": yaml_splits["train"],
+            "val": yaml_splits["val"],
+            "names": dict(enumerate(class_names)),
+            "nc": len(class_names),
+        }
+        test_split = yaml_splits.get("test")
+        if test_split:
+            payload["test"] = test_split
+
+        if output_path:
+            target = Path(output_path).expanduser()
+        else:
+            target = root / "data.yaml"
+        target.parent.mkdir(parents=True, exist_ok=True)
+        with open(target, "w", encoding="utf-8") as handle:
+            yaml.safe_dump(payload, handle, sort_keys=False)
+
+        logger.info(f"Generated data.yaml at {target}")
+        return target.as_posix()
+
+    def _fetch_annotation_class_names(self) -> List[str]:
+        """List names of the classes used by at least one annotation, sorted by class ID."""
+        connection = self.get_connection()
+        try:
+            db_cursor = connection.cursor()
+            db_cursor.execute(
+                """
+                SELECT DISTINCT c.id, c.class_name
+                FROM annotations a
+                JOIN object_classes c ON a.class_id = c.id
+                ORDER BY c.id
+                """
+            )
+            annotated = db_cursor.fetchall()
+            return [record["class_name"] for record in annotated]
+        finally:
+            connection.close()
+
+    def _infer_split_dirs(self, dataset_root: Path) -> Dict[str, str]:
+        """
+        Guess the train/val/test image folders from common YOLO dataset layouts.
+
+        For each split the candidates ``<alias>/images``, ``images/<alias>`` and
+        ``<alias>`` are probed in that order (short alias before long alias within
+        each form); the first existing directory that contains images wins.
+
+        Args:
+            dataset_root: Dataset base directory the candidates are joined onto.
+
+        Returns:
+            Mapping with the keys ``train``, ``val`` and ``test``. Each value is the
+            POSIX path of the match, relative to ``dataset_root`` when possible and
+            absolute otherwise, or ``""`` if nothing matched.
+        """
+        aliases = {
+            "train": ("train", "training"),
+            "val": ("val", "validation"),
+            "test": ("test", "testing"),
+        }
+
+        found: Dict[str, str] = {}
+        for split_name, names in aliases.items():
+            found[split_name] = ""
+            # Probe order: <alias>/images, then images/<alias>, then the bare <alias>.
+            layouts = (
+                [f"{name}/images" for name in names]
+                + [f"images/{name}" for name in names]
+                + list(names)
+            )
+            for layout in layouts:
+                candidate = (dataset_root / layout).resolve()
+                if not (candidate.exists() and candidate.is_dir()):
+                    continue
+                if not self._directory_has_images(candidate):
+                    continue
+                try:
+                    relative = candidate.relative_to(dataset_root)
+                except ValueError:
+                    # Candidate resolved to a location outside dataset_root: keep it absolute.
+                    found[split_name] = candidate.as_posix()
+                else:
+                    found[split_name] = relative.as_posix()
+                break
+        return found
+
+    def _normalize_split_value(self, split_value: str, dataset_root: Path) -> str:
+        """Validate a split directory and return the path string for the YAML file.
+
+        Relative values are joined onto ``dataset_root``. The result is relative to the
+        root when the directory lies beneath it, else absolute. Raises ``ValueError``
+        if the directory does not exist or no image is found in it.
+        """
+        # Joining is a no-op for absolute values: pathlib keeps the absolute operand.
+        location = (dataset_root / Path(split_value).expanduser()).resolve()
+        if not (location.exists() and location.is_dir()):
+            raise ValueError(f"Split directory not found: {location}")
+        if not self._directory_has_images(location):
+            raise ValueError(f"No images found under {location}")
+
+        try:
+            return location.relative_to(dataset_root).as_posix()
+        except ValueError:
+            return location.as_posix()
+
+    @staticmethod
+    def _directory_has_images(directory: Path, max_checks: int = 2000) -> bool:
+        """Return True if an image turns up before max_checks other files are seen."""
+        skipped = 0
+        try:
+            for file_path in directory.rglob("*"):
+                if not file_path.is_file():
+                    continue
+                if file_path.suffix.lower() in IMAGE_EXTENSIONS:
+                    return True
+                skipped += 1
+                if skipped >= max_checks:
+                    break
+        except Exception as exc:  # best-effort probe: an unreadable tree yields False
+            logger.warning(f"Could not scan {directory} for images: {exc}")
+        return False
+
    @staticmethod
    def calculate_checksum(file_path: str) -> str:
        """Calculate MD5 checksum of a file."""
--- a/src/gui/main_window.py
+++ b/src/gui/main_window.py
@@ -297,7 +297,9 @@ class MainWindow(QMainWindow):
            # Save window state before closing
            self._save_window_state()

-            # Save annotation tab state if it exists
+            # Persist tab state and stop background work before exit
+            if hasattr(self, "training_tab"):
+                self.training_tab.shutdown()
            if hasattr(self, "annotation_tab"):
                self.annotation_tab.save_state()

--- a/src/gui/tabs/training_tab.py
+++ b/src/gui/tabs/training_tab.py
--- a/src/model/yolo_wrapper.py
+++ b/src/model/yolo_wrapper.py
@@ -55,6 +55,7 @@ class YOLOWrapper:
        save_dir: str = "data/models",
        name: str = "custom_model",
        resume: bool = False,
+        callbacks: Optional[Dict[str, Callable]] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """
@@ -69,6 +70,7 @@ class YOLOWrapper:
            save_dir: Directory to save trained model
            name: Name for the training run
            resume: Resume training from last checkpoint
+            callbacks: Optional Ultralytics callback dictionary
            **kwargs: Additional training arguments

        Returns: