"""SHA-256 hash registry for the Lock Pipeline.

Usage
-----
Build / refresh the registry (records current state of all tracked files):
    python hash_registry.py

Verify current files against a previously saved registry:
    python hash_registry.py --verify

Tracked categories
------------------
  input   — *_7Metrics.xlsx  (58 source workbooks)
  script  — *.py             (all pipeline scripts)
  doc     — README.md, VALIDATION_REPORT.txt
  config  — requirements.txt, *.code-workspace
  output  — summary/**/*.txt and summary/**/*.json

The registry itself (registry.json) is always excluded from tracking to avoid
a self-referential hash chain.
"""
from __future__ import annotations

import argparse
import hashlib
import json
import sys
from pathlib import Path

# Directory that contains this script; tracked files are located under it and
# registry keys are expressed relative to it.
_ROOT = Path(__file__).resolve().parent
# Location of the registry document written by a build and read by --verify.
_REGISTRY_FILE = _ROOT.joinpath("registry.json")
# Stored under "schema_version" in the registry payload.
_SCHEMA_VERSION = "1.1"

# Patterns matched directly under the root (non-recursive).
_INPUT_GLOB = "*_7Metrics.xlsx"
_SCRIPT_GLOB = "*.py"
# Exact file names looked up directly under the root; absent ones are skipped.
_DOC_FILES = ["README.md", "VALIDATION_REPORT.txt"]
_CONFIG_FILES = ["requirements.txt"]
_CONFIG_GLOBS = ["*.code-workspace"]
# Recursive patterns for generated artefacts below summary/.
_OUTPUT_GLOBS = ["summary/**/*.txt", "summary/**/*.json"]
# Root-relative keys the collector refuses to record, so that the registry
# never ends up hashing itself.
_EXCLUDED = {_REGISTRY_FILE.name}


def _sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is opened in binary mode and fed to the hasher in 64 KiB reads,
    so the whole file is never held in memory at once.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(65_536)
            if block == b"":
                # An empty read marks end of file.
                break
            digest.update(block)
    return digest.hexdigest()


def _make_entry(path: Path, category: str) -> dict:
    """Build the registry record (digest, size, category) for one file."""
    return {
        "sha256": _sha256(path),
        "size_bytes": path.stat().st_size,
        "category": category,
    }


def _collect(root: Path) -> dict[str, dict]:
    """Hash every tracked file under *root* and return the registry entries.

    Args:
        root: Directory the tracked patterns and file names are resolved
            against.

    Returns:
        A mapping from each file's POSIX-style path relative to *root* to a
        record with the keys ``sha256``, ``size_bytes`` and ``category``.
        Entries appear in category order (input, script, doc, config,
        output), with glob matches sorted within each pattern.

    Candidates that are not regular files (a missing doc/config file, a
    directory whose name happens to match a pattern, a dangling symlink) are
    skipped, as is any key listed in ``_EXCLUDED``.
    """
    # (category, candidate paths) in the order entries are recorded.
    sources = [
        ("input", sorted(root.glob(_INPUT_GLOB))),
        ("script", sorted(root.glob(_SCRIPT_GLOB))),
        ("doc", [root / name for name in _DOC_FILES]),
        ("config", [root / name for name in _CONFIG_FILES]),
        *(("config", sorted(root.glob(pattern))) for pattern in _CONFIG_GLOBS),
        *(("output", sorted(root.glob(pattern))) for pattern in _OUTPUT_GLOBS),
    ]

    entries: dict[str, dict] = {}
    for category, candidates in sources:
        for path in candidates:
            # Only regular files can be hashed; anything else would make
            # _sha256 raise when it tries to open the path.
            if not path.is_file():
                continue
            key = path.relative_to(root).as_posix()
            if key in _EXCLUDED:
                continue
            entries[key] = _make_entry(path, category)

    return entries


def cmd_build() -> None:
    """Hash all tracked files, write the registry, and print a summary.

    The registry document records the schema version, a generation timestamp,
    the root directory, the number of entries and the entries themselves. It
    overwrites any existing registry file. A one-line summary with per-category
    counts is printed to stdout afterwards.
    """
    from _pipeline_utils import _generated_timestamp

    records = _collect(_ROOT)
    document = {
        "schema_version": _SCHEMA_VERSION,
        "generated": _generated_timestamp(),
        "root": _ROOT.as_posix(),
        "entry_count": len(records),
        "entries": records,
    }
    serialized = json.dumps(document, indent=2, ensure_ascii=True)
    _REGISTRY_FILE.write_text(serialized, encoding="utf-8")

    # Per-category counts, reported in alphabetical category order.
    categories = [record["category"] for record in records.values()]
    parts = [f"{name}={categories.count(name)}" for name in sorted(set(categories))]
    summary = ", ".join(parts)
    print(f"Registry written: {_REGISTRY_FILE.name}  ({len(records)} entries: {summary})")


def cmd_verify() -> int:
    """Compare the files currently on disk with the saved registry.

    Every key present in either the saved registry or the current file set is
    classified as added, removed, changed (digest differs) or unchanged, and
    a report is printed to stdout.

    Returns:
        0 when every entry matches the registry, 1 when at least one entry
        was added, removed or changed, and 2 when no usable registry is
        available (missing, unreadable, not valid JSON, or without a usable
        ``entries`` mapping). Errors for the last case go to stderr.
    """
    try:
        raw = _REGISTRY_FILE.read_text(encoding="utf-8")
    except FileNotFoundError:
        print(
            "ERROR: registry.json not found. Run without --verify first to build a baseline.",
            file=sys.stderr,
        )
        return 2
    except (OSError, ValueError) as exc:
        # ValueError covers UnicodeDecodeError for a registry that is not UTF-8.
        print(f"ERROR: registry.json could not be read: {exc}", file=sys.stderr)
        return 2

    try:
        saved = json.loads(raw)
    except ValueError as exc:
        # json.JSONDecodeError is a ValueError subclass; a truncated or
        # hand-edited registry lands here instead of producing a traceback.
        print(f"ERROR: registry.json is not valid JSON: {exc}", file=sys.stderr)
        return 2

    saved_entries = saved.get("entries", {}) if isinstance(saved, dict) else None
    if not isinstance(saved_entries, dict):
        print(
            "ERROR: registry.json has no usable 'entries' mapping. "
            "Run without --verify to rebuild it.",
            file=sys.stderr,
        )
        return 2

    current_entries = _collect(_ROOT)

    ok = changed = added = removed = 0
    issues: list[str] = []

    for key in sorted(set(saved_entries) | set(current_entries)):
        if key not in saved_entries:
            added += 1
            issues.append(f"  ADDED   {key}")
            continue
        if key not in current_entries:
            removed += 1
            issues.append(f"  REMOVED {key}")
            continue

        # A saved record without a digest can never match, so it is reported
        # as changed rather than crashing the comparison.
        record = saved_entries[key]
        saved_hash = record.get("sha256") if isinstance(record, dict) else None
        if saved_hash == current_entries[key]["sha256"]:
            ok += 1
            continue

        changed += 1
        shown = f"{saved_hash[:12]}..." if isinstance(saved_hash, str) else "<missing>"
        issues.append(f"  CHANGED {key}  (saved={shown})")

    if issues:
        print(
            f"Verification report  —  "
            f"{changed} changed, {added} added, {removed} removed, {ok} unchanged:"
        )
        for line in issues:
            print(line)
        return 1

    print(f"Verification PASSED — all {ok} entries match the registry.")
    return 0


def main() -> None:
    """Command-line entry point.

    Without arguments the registry is rebuilt. With ``--verify`` the current
    files are checked against the saved registry and the process exits with
    the status code returned by ``cmd_verify``.
    """
    cli = argparse.ArgumentParser(
        description="Lock Pipeline hash registry — build or verify SHA-256 digests.",
    )
    cli.add_argument(
        "--verify",
        action="store_true",
        help="Compare current files against the saved registry.json instead of rebuilding it.",
    )
    options = cli.parse_args()

    if not options.verify:
        cmd_build()
        return

    sys.exit(cmd_verify())


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
