From b06a65e1338268aa8dc07377eb67723ea4b856c9 Mon Sep 17 00:00:00 2001 From: Ryth Azhur Date: Mon, 6 Apr 2026 22:50:05 -0400 Subject: [PATCH] Fix critical cache issues: user_id scoping, AND purge, error handling, TTL Fixes from 8-expert review: - Pass user_id from request.user.pk to cache key derivation (data leak fix) - Scoped purge uses AND (intersection) not OR (union) semantics - All cache ops in executor wrapped in try/except with logging fallthrough - Thread-safe cache initialization with threading.Lock - RedisCache: 24h safety-net TTL, connection timeouts, MULTI/EXEC pipeline - RedisCache.clear() uses pipelined UNLINK instead of per-batch DELETE - build_index_keys now stringifies values matching derive_cache_key - get_cache() logs warnings for partial config and connection failures - Wire-protocol internals removed from __all__ Remaining open: purge atomicity (Lua script), cross-language str() canon, broad purge sub-index cleanup, thundering herd protection, RedisCache tests. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/mizan/cache/KNOWN_ISSUES.md | 91 +++++++------------ .../mizan-django/src/mizan/cache/__init__.py | 64 +++++++++---- .../mizan-django/src/mizan/cache/backend.py | 27 +++++- packages/mizan-django/src/mizan/cache/keys.py | 2 +- .../mizan-django/src/mizan/client/executor.py | 53 +++++++---- 5 files changed, 140 insertions(+), 97 deletions(-) diff --git a/packages/mizan-django/src/mizan/cache/KNOWN_ISSUES.md b/packages/mizan-django/src/mizan/cache/KNOWN_ISSUES.md index 396ecb4..40b90fc 100644 --- a/packages/mizan-django/src/mizan/cache/KNOWN_ISSUES.md +++ b/packages/mizan-django/src/mizan/cache/KNOWN_ISSUES.md @@ -1,88 +1,65 @@ # Cache Module — Known Issues -Issues identified by 8-domain-expert review of the initial implementation. -Fix in priority order before shipping. +Issues identified by 8-domain-expert review. Status tracked here. ## Critical (Security / Data Corruption) -### 1. User-scoped content cached without user_id -`context_fetch_view` never passes `user_id` to `cache_get`/`cache_put`. -Authenticated User A's response gets cached and served to User B. -**Fix:** Extract user_id from request (MWT `sub` claim when available, -`request.user.pk` as interim) and pass to cache operations. +### 1. ~~User-scoped content cached without user_id~~ FIXED +`context_fetch_view` now extracts `user_id` from `request.user.pk` and +passes it to `cache_get`/`cache_put`. ### 2. Purge race condition (non-atomic index operations) -`cache_purge` does `get_index` -> `delete_index` -> `delete_many` as -separate operations. Concurrent `cache_put` between steps can orphan -entries or lose purges. -**Fix:** Use Redis Lua script or WATCH/MULTI for atomic purge. -MemoryCache should use a threading lock. +`cache_purge` does index reads and deletes as separate operations. +Concurrent `cache_put` between steps can orphan entries. +**Status:** Partially mitigated by AND semantics fix. Full atomicity +(Lua script or WATCH/MULTI) still needed for Redis backend. -### 3. No Redis error handling -Any Redis failure throws `ConnectionError` into the request path -> 500. -No try/except, no fallback, no circuit breaker. -**Fix:** Wrap all Redis calls in try/except. On failure, fall through -to uncached execution (cache miss behavior). +### 3. ~~No Redis error handling~~ FIXED +All cache operations in `executor.py` wrapped in try/except with +`logger.warning`. Redis failure falls through to uncached execution. -### 4. Scoped purge uses OR semantics, should use AND -`cache_purge({user_id: 5, org_id: 3})` deletes everything with -`user_id=5` OR `org_id=3`. Should intersect index lookups. -**Fix:** Change `keys_to_delete.update()` to set intersection. +### 4. ~~Scoped purge uses OR semantics~~ FIXED +Changed to AND (intersection). `{user_id: 5, org_id: 3}` now only +deletes entries matching BOTH params. ## High (Correctness / Operability) -### 5. No TTL on Redis entries -If purge fails or `affects` is misconfigured, stale data persists forever. -**Fix:** Add safety-net TTL to `RedisCache.put` (e.g., 24h default). +### 5. ~~No TTL on Redis entries~~ FIXED +`RedisCache.put` now sets `ex=86400` (24h safety-net TTL) by default. ### 6. Cross-language str() vs String() divergence Python `str(True)` -> `"True"`, JS `String(true)` -> `"true"`. -Python `str(None)` -> `"None"`, JS `String(null)` -> `"null"`. -Cache keys will mismatch between Python and TypeScript adapters. -**Fix:** Define canonical stringification rules in the protocol spec. -Normalize booleans to "true"/"false", null to "null", numbers to -consistent format before stringification. +**Status:** Open. Needs canonical stringification rules in protocol spec. ### 7. Broad purge doesn't clean per-param sub-indexes -After broad purge of `mizan:idx:user`, the per-param indexes -(`mizan:idx:user:user_id=5`) remain as dangling sets. Slow memory leak. -**Fix:** On broad purge, also scan and delete `mizan:idx:{context}:*` indexes. +**Status:** Open. Slow memory leak in Redis. -### 8. build_index_keys doesn't stringify values -`derive_cache_key` calls `str(v)` but `build_index_keys` uses raw `v`. -Latent inconsistency for non-string types. -**Fix:** Stringify values in `build_index_keys` too. +### 8. ~~build_index_keys doesn't stringify values~~ FIXED +Now calls `str(v)` on all values, matching `derive_cache_key`. -### 9. Silent exception swallowing in get_cache() -Misconfigured Redis URL or missing secret produces no log, no warning. -**Fix:** Log warning on partial config and connection failure. +### 9. ~~Silent exception swallowing in get_cache()~~ FIXED +Now logs warnings for partial config and connection failures. -### 10. _initialized flag not thread-safe -Two concurrent workers calling `get_cache()` race on globals. -**Fix:** Use `threading.Lock` or resolve at `AppConfig.ready()`. +### 10. ~~_initialized flag not thread-safe~~ FIXED +Now uses `threading.Lock` for thread-safe initialization. ## Medium (Design / Performance) ### 11. No thundering-herd protection -Concurrent cold misses all execute and write. Origin cache should -deduplicate in-flight requests (request coalescing). +**Status:** Open. Concurrent cold misses all execute and write. -### 12. Wire-protocol internals in __all__ -`derive_cache_key` and `build_index_keys` are promoted to public API. -Changing key format requires semver major bump. -**Fix:** Remove from `__all__`, prefix with `_` or move to internal module. +### 12. ~~Wire-protocol internals in __all__~~ FIXED +`derive_cache_key` and `build_index_keys` removed from `__all__`. ### 13. Inconsistent API pattern -`cache_get`/`cache_put` take explicit `secret`+`backend` args but -executor fetches these from globals. Pick one pattern. +**Status:** Open. `cache_get`/`cache_put` take explicit args but executor +fetches from globals. -### 14. clear() uses SCAN + DELETE without pipeline -O(N) round trips for large caches. -**Fix:** Pipeline the deletes. +### 14. ~~clear() uses SCAN + DELETE without pipeline~~ FIXED +Now uses pipeline with UNLINK for batched async deletes. -### 15. No Redis connection timeouts -`from_url()` has no `socket_connect_timeout`, `socket_timeout`, or -`health_check_interval`. +### 15. ~~No Redis connection timeouts~~ FIXED +`socket_connect_timeout=5`, `socket_timeout=5`, `health_check_interval=30`. ### 16. No RedisCache test coverage -Only MemoryCache is tested. Use `fakeredis` for RedisCache tests. +**Status:** Open. Only MemoryCache is tested. diff --git a/packages/mizan-django/src/mizan/cache/__init__.py b/packages/mizan-django/src/mizan/cache/__init__.py index f9dfc47..2f5292b 100644 --- a/packages/mizan-django/src/mizan/cache/__init__.py +++ b/packages/mizan-django/src/mizan/cache/__init__.py @@ -20,13 +20,18 @@ Configuration (Django settings): from __future__ import annotations +import logging +import threading from typing import Any from .backend import CacheBackend, MemoryCache, RedisCache from .keys import derive_cache_key, build_index_keys +logger = logging.getLogger("mizan.cache") + _cache_instance: CacheBackend | None = None _initialized = False +_init_lock = threading.Lock() def get_cache() -> CacheBackend | None: @@ -35,20 +40,37 @@ def get_cache() -> CacheBackend | None: Returns RedisCache if MIZAN_CACHE_SECRET and MIZAN_CACHE_REDIS_URL are both set. Returns None otherwise. The instance is cached for the process - lifetime. + lifetime. Thread-safe. """ global _cache_instance, _initialized if _initialized: return _cache_instance - _initialized = True - try: - from mizan.setup.settings import get_settings - settings = get_settings() - if settings.cache_secret and settings.cache_redis_url: - _cache_instance = RedisCache(settings.cache_redis_url) - except Exception: - _cache_instance = None + with _init_lock: + if _initialized: + return _cache_instance + + _initialized = True + try: + from mizan.setup.settings import get_settings + settings = get_settings() + + if settings.cache_secret and settings.cache_redis_url: + _cache_instance = RedisCache(settings.cache_redis_url) + logger.info("Mizan cache enabled (Redis: %s)", settings.cache_redis_url) + elif settings.cache_secret and not settings.cache_redis_url: + logger.warning( + "MIZAN_CACHE_SECRET is set but MIZAN_CACHE_REDIS_URL is missing. " + "Cache is disabled." + ) + elif settings.cache_redis_url and not settings.cache_secret: + logger.warning( + "MIZAN_CACHE_REDIS_URL is set but MIZAN_CACHE_SECRET is missing. " + "Cache is disabled." + ) + except Exception: + logger.warning("Failed to initialize Mizan cache", exc_info=True) + _cache_instance = None return _cache_instance @@ -120,14 +142,26 @@ def cache_purge( Returns the number of entries purged. """ if params: - # Scoped purge — find entries matching each param via index - keys_to_delete: set[str] = set() + # Scoped purge — intersect index lookups (AND semantics) + # Entry must match ALL params, not just any one. + sets_per_param: list[set[str]] = [] + param_index_keys: list[str] = [] for k, v in sorted(params.items()): - index_key = f"mizan:idx:{context}:{k}={v}" - keys_to_delete.update(backend.get_index(index_key)) - backend.delete_index(index_key) + index_key = f"mizan:idx:{context}:{k}={str(v)}" + param_index_keys.append(index_key) + sets_per_param.append(backend.get_index(index_key)) + + if sets_per_param: + keys_to_delete = sets_per_param[0] + for s in sets_per_param[1:]: + keys_to_delete = keys_to_delete & s + else: + keys_to_delete = set() if keys_to_delete: + # Clean up per-param indexes + for idx_key in param_index_keys: + backend.remove_from_index(idx_key, keys_to_delete) # Remove from the broad context index too backend.remove_from_index(f"mizan:idx:{context}", keys_to_delete) return backend.delete_many(list(keys_to_delete)) @@ -153,6 +187,4 @@ __all__ = [ "cache_get", "cache_put", "cache_purge", - "derive_cache_key", - "build_index_keys", ] diff --git a/packages/mizan-django/src/mizan/cache/backend.py b/packages/mizan-django/src/mizan/cache/backend.py index 782a077..9b097ae 100644 --- a/packages/mizan-django/src/mizan/cache/backend.py +++ b/packages/mizan-django/src/mizan/cache/backend.py @@ -78,7 +78,15 @@ class RedisCache: Requires `redis-py` (pip install mizan[cache]). """ - def __init__(self, redis_url: str, prefix: str = "mizan:cache:") -> None: + # Safety-net TTL: entries expire even if purge fails (24 hours) + DEFAULT_TTL = 86400 + + def __init__( + self, + redis_url: str, + prefix: str = "mizan:cache:", + ttl: int | None = None, + ) -> None: try: import redis except ImportError: @@ -86,8 +94,14 @@ class RedisCache: "Redis is required for Mizan's cache backend. " "Install it with: pip install mizan[cache]" ) - self._client = redis.from_url(redis_url) + self._client = redis.from_url( + redis_url, + socket_connect_timeout=5, + socket_timeout=5, + health_check_interval=30, + ) self._prefix = prefix + self._ttl = ttl if ttl is not None else self.DEFAULT_TTL def _key(self, key: str) -> str: return f"{self._prefix}{key}" @@ -98,8 +112,8 @@ class RedisCache: def put(self, key: str, value: bytes, indexes: list[str]) -> None: prefixed_key = self._key(key) - pipe = self._client.pipeline() - pipe.set(prefixed_key, value) + pipe = self._client.pipeline(transaction=True) + pipe.set(prefixed_key, value, ex=self._ttl) for idx in indexes: pipe.sadd(self._key(idx), key) pipe.execute() @@ -127,6 +141,9 @@ class RedisCache: while True: cursor, keys = self._client.scan(cursor, match=pattern, count=100) if keys: - self._client.delete(*keys) + pipe = self._client.pipeline() + for key in keys: + pipe.unlink(key) + pipe.execute() if cursor == 0: break diff --git a/packages/mizan-django/src/mizan/cache/keys.py b/packages/mizan-django/src/mizan/cache/keys.py index cf5a161..537bdce 100644 --- a/packages/mizan-django/src/mizan/cache/keys.py +++ b/packages/mizan-django/src/mizan/cache/keys.py @@ -75,5 +75,5 @@ def build_index_keys(context: str, params: dict[str, Any]) -> list[str]: """ keys = [f"mizan:idx:{context}"] for k, v in sorted(params.items()): - keys.append(f"mizan:idx:{context}:{k}={v}") + keys.append(f"mizan:idx:{context}:{k}={str(v)}") return keys diff --git a/packages/mizan-django/src/mizan/client/executor.py b/packages/mizan-django/src/mizan/client/executor.py index 0ff23b4..1ec31ba 100644 --- a/packages/mizan-django/src/mizan/client/executor.py +++ b/packages/mizan-django/src/mizan/client/executor.py @@ -662,14 +662,19 @@ def function_call_view(request: HttpRequest) -> JsonResponse: response["X-Mizan-Invalidate"] = _format_invalidate_header(invalidate_contexts) # Purge origin-side cache for invalidated contexts + import logging from mizan.cache import get_cache, cache_purge + _cache_log = logging.getLogger("mizan.cache") cache = get_cache() if cache is not None: - for entry in invalidate_contexts: - if isinstance(entry, str): - cache_purge(cache, entry) - elif isinstance(entry, dict): - cache_purge(cache, entry["context"], entry.get("params")) + try: + for entry in invalidate_contexts: + if isinstance(entry, str): + cache_purge(cache, entry) + elif isinstance(entry, dict): + cache_purge(cache, entry["context"], entry.get("params")) + except Exception: + _cache_log.warning("Cache purge failed", exc_info=True) return response @@ -771,19 +776,28 @@ def context_fetch_view(request: HttpRequest, context_name: str) -> JsonResponse: params = request.GET.dict() # Origin-side cache lookup + import logging from mizan.cache import get_cache, cache_get, cache_put from mizan.setup.settings import get_settings + _cache_log = logging.getLogger("mizan.cache") cache = get_cache() cache_settings = get_settings() + user_id = None + if hasattr(request, "user") and hasattr(request.user, "pk") and request.user.pk: + user_id = str(request.user.pk) if cache is not None and cache_settings.cache_secret: - cached = cache_get( - cache_settings.cache_secret, cache, context_name, params, - ) - if cached is not None: - response = HttpResponse(cached, content_type="application/json") - response["Cache-Control"] = "public, max-age=0, s-maxage=31536000" - response["X-Mizan-Cache"] = "HIT" - return response + try: + cached = cache_get( + cache_settings.cache_secret, cache, context_name, params, + user_id=user_id, + ) + if cached is not None: + response = HttpResponse(cached, content_type="application/json") + response["Cache-Control"] = "public, max-age=0, s-maxage=31536000" + response["X-Mizan-Cache"] = "HIT" + return response + except Exception: + _cache_log.warning("Cache lookup failed, falling through", exc_info=True) result = execute_context(request, context_name, params) @@ -813,10 +827,13 @@ def context_fetch_view(request: HttpRequest, context_name: str) -> JsonResponse: # Store in origin-side cache if cache is not None and cache_settings.cache_secret: - cache_put( - cache_settings.cache_secret, cache, context_name, params, - response.content, - ) - response["X-Mizan-Cache"] = "MISS" + try: + cache_put( + cache_settings.cache_secret, cache, context_name, params, + response.content, user_id=user_id, + ) + response["X-Mizan-Cache"] = "MISS" + except Exception: + _cache_log.warning("Cache store failed", exc_info=True) return response