{-# LANGUAGE ScopedTypeVariables #-}

module Ouroboros.Consensus.Node.RethrowPolicy (consensusRethrowPolicy) where

import Control.Monad.Class.MonadAsync (ExceptionInLinkedThread (..))
import Control.ResourceRegistry
  ( RegistryClosedException
  , ResourceRegistryThreadException
  , TempRegistryException
  )
import Data.Proxy (Proxy)
import Data.Typeable (Typeable)
import Ouroboros.Consensus.Block (StandardHash)
import Ouroboros.Consensus.BlockchainTime
import Ouroboros.Consensus.MiniProtocol.BlockFetch.Server
  ( BlockFetchServerException
  )
import Ouroboros.Consensus.MiniProtocol.ChainSync.Client
  ( ChainSyncClientException
  )
import Ouroboros.Consensus.Node.DbLock
import Ouroboros.Consensus.Node.DbMarker (DbMarkerError)
import Ouroboros.Consensus.Storage.ChainDB.API
  ( ChainDbError (..)
  , ChainDbFailure
  )
import Ouroboros.Consensus.Storage.ImmutableDB.API (ImmutableDBError)
import qualified Ouroboros.Consensus.Storage.ImmutableDB.API as ImmutableDB
import Ouroboros.Consensus.Storage.VolatileDB.API (VolatileDBError)
import qualified Ouroboros.Consensus.Storage.VolatileDB.API as VolatileDB
import Ouroboros.Network.RethrowPolicy
import System.FS.API.Types (FsError)

-- Exception raised during interaction with the peer
--
-- The list below should contain an entry for every type declared as an
-- instance of 'Exception' within ouroboros-consensus.
--
-- If a particular exception is not handled by any policy, a default
-- kicks in, which currently means logging the exception and disconnecting
-- from the peer (in both directions), but allowing a reconnect within a saall
-- delay (10-20s). This is fine for exceptions that only affect that peer.  It
-- is however essential that we handle exceptions here that /must/ shut down the
-- node (mainly storage layer errors).
--
-- TODO: Talk to devops about what they should do when the node does
-- terminate with a storage layer exception (restart with full recovery).
consensusRethrowPolicy ::
  forall blk.
  (Typeable blk, StandardHash blk) =>
  Proxy blk ->
  RethrowPolicy
consensusRethrowPolicy :: forall blk.
(Typeable blk, StandardHash blk) =>
Proxy blk -> RethrowPolicy
consensusRethrowPolicy Proxy blk
pb =
  (ErrorContext -> DbMarkerError -> ErrorCommand) -> RethrowPolicy
forall e.
Exception e =>
(ErrorContext -> e -> ErrorCommand) -> RethrowPolicy
mkRethrowPolicy (\ErrorContext
_ctx (DbMarkerError
_ :: DbMarkerError) -> ErrorCommand
shutdownNode)
    -- Any exceptions in the storage layer should terminate the node
    --
    -- NOTE: We do not catch IOExceptions here; they /ought/ to be caught
    -- by the FS layer (and turn into FsError). If we do want to catch
    -- them, we'd somehow have to distinguish between IO exceptions
    -- arising from disk I/O (shutdownNode) and those arising from
    -- network failures (SuspendConsumer).
    RethrowPolicy -> RethrowPolicy -> RethrowPolicy
forall a. Semigroup a => a -> a -> a
<> (ErrorContext -> DbMarkerError -> ErrorCommand) -> RethrowPolicy
forall e.
Exception e =>
(ErrorContext -> e -> ErrorCommand) -> RethrowPolicy
mkRethrowPolicy (\ErrorContext
_ctx (DbMarkerError
_ :: DbMarkerError) -> ErrorCommand
shutdownNode)
    RethrowPolicy -> RethrowPolicy -> RethrowPolicy
forall a. Semigroup a => a -> a -> a
<> (ErrorContext -> DbLocked -> ErrorCommand) -> RethrowPolicy
forall e.
Exception e =>
(ErrorContext -> e -> ErrorCommand) -> RethrowPolicy
mkRethrowPolicy (\ErrorContext
_ctx (DbLocked
_ :: DbLocked) -> ErrorCommand
shutdownNode)
    RethrowPolicy -> RethrowPolicy -> RethrowPolicy
forall a. Semigroup a => a -> a -> a
<> (ErrorContext -> ChainDbFailure blk -> ErrorCommand)
-> RethrowPolicy
forall e.
Exception e =>
(ErrorContext -> e -> ErrorCommand) -> RethrowPolicy
mkRethrowPolicy (\ErrorContext
_ctx (ChainDbFailure blk
_ :: ChainDbFailure blk) -> ErrorCommand
shutdownNode)
    RethrowPolicy -> RethrowPolicy -> RethrowPolicy
forall a. Semigroup a => a -> a -> a
<> (ErrorContext -> VolatileDBError blk -> ErrorCommand)
-> RethrowPolicy
forall e.
Exception e =>
(ErrorContext -> e -> ErrorCommand) -> RethrowPolicy
mkRethrowPolicy
      ( \ErrorContext
_ctx (VolatileDBError blk
e :: VolatileDBError blk) ->
          case VolatileDBError blk
e of
            VolatileDB.ApiMisuse{} -> ErrorCommand
ourBug
            VolatileDB.UnexpectedFailure{} -> ErrorCommand
shutdownNode
      )
    RethrowPolicy -> RethrowPolicy -> RethrowPolicy
forall a. Semigroup a => a -> a -> a
<> (ErrorContext -> ImmutableDBError blk -> ErrorCommand)
-> RethrowPolicy
forall e.
Exception e =>
(ErrorContext -> e -> ErrorCommand) -> RethrowPolicy
mkRethrowPolicy
      ( \ErrorContext
_ctx (ImmutableDBError blk
e :: ImmutableDBError blk) ->
          case ImmutableDBError blk
e of
            ImmutableDB.ApiMisuse{} -> ErrorCommand
ourBug
            ImmutableDB.UnexpectedFailure{} -> ErrorCommand
shutdownNode
      )
    RethrowPolicy -> RethrowPolicy -> RethrowPolicy
forall a. Semigroup a => a -> a -> a
<> (ErrorContext -> FsError -> ErrorCommand) -> RethrowPolicy
forall e.
Exception e =>
(ErrorContext -> e -> ErrorCommand) -> RethrowPolicy
mkRethrowPolicy (\ErrorContext
_ctx (FsError
_ :: FsError) -> ErrorCommand
shutdownNode)
    -- When the system clock moved back, we have to restart the node.
    RethrowPolicy -> RethrowPolicy -> RethrowPolicy
forall a. Semigroup a => a -> a -> a
<> (ErrorContext -> SystemClockMovedBackException -> ErrorCommand)
-> RethrowPolicy
forall e.
Exception e =>
(ErrorContext -> e -> ErrorCommand) -> RethrowPolicy
mkRethrowPolicy (\ErrorContext
_ctx (SystemClockMovedBackException
_ :: SystemClockMovedBackException) -> ErrorCommand
shutdownNode)
    -- Some chain DB errors are indicative of a bug in our code, others
    -- indicate an invalid request from the peer. If the DB is closed
    -- entirely, it will only be reopened after a node restart.
    RethrowPolicy -> RethrowPolicy -> RethrowPolicy
forall a. Semigroup a => a -> a -> a
<> (ErrorContext -> ChainDbError blk -> ErrorCommand) -> RethrowPolicy
forall e.
Exception e =>
(ErrorContext -> e -> ErrorCommand) -> RethrowPolicy
mkRethrowPolicy
      ( \ErrorContext
_ctx (ChainDbError blk
e :: ChainDbError blk) ->
          case ChainDbError blk
e of
            ClosedDBError{} -> ErrorCommand
shutdownNode
            ClosedFollowerError{} -> ErrorCommand
ourBug
            InvalidIteratorRange{} -> ErrorCommand
theyBuggyOrEvil
      )
    -- We have some resource registries that are used per-connection,
    -- and so if we have ResourceRegistry related exception, we close
    -- the connection but leave the rest of the node running.
    RethrowPolicy -> RethrowPolicy -> RethrowPolicy
forall a. Semigroup a => a -> a -> a
<> (ErrorContext -> RegistryClosedException -> ErrorCommand)
-> RethrowPolicy
forall e.
Exception e =>
(ErrorContext -> e -> ErrorCommand) -> RethrowPolicy
mkRethrowPolicy (\ErrorContext
_ctx (RegistryClosedException
_ :: RegistryClosedException) -> ErrorCommand
ourBug)
    RethrowPolicy -> RethrowPolicy -> RethrowPolicy
forall a. Semigroup a => a -> a -> a
<> (ErrorContext -> ResourceRegistryThreadException -> ErrorCommand)
-> RethrowPolicy
forall e.
Exception e =>
(ErrorContext -> e -> ErrorCommand) -> RethrowPolicy
mkRethrowPolicy (\ErrorContext
_ctx (ResourceRegistryThreadException
_ :: ResourceRegistryThreadException) -> ErrorCommand
ourBug)
    RethrowPolicy -> RethrowPolicy -> RethrowPolicy
forall a. Semigroup a => a -> a -> a
<> (ErrorContext -> TempRegistryException -> ErrorCommand)
-> RethrowPolicy
forall e.
Exception e =>
(ErrorContext -> e -> ErrorCommand) -> RethrowPolicy
mkRethrowPolicy (\ErrorContext
_ctx (TempRegistryException
_ :: TempRegistryException) -> ErrorCommand
ourBug)
    -- An exception in the block fetch server meant the client asked
    -- for some blocks we used to have but got GCed. This means the
    -- peer is on a chain that forks off more than @k@ blocks away.
    RethrowPolicy -> RethrowPolicy -> RethrowPolicy
forall a. Semigroup a => a -> a -> a
<> (ErrorContext -> BlockFetchServerException -> ErrorCommand)
-> RethrowPolicy
forall e.
Exception e =>
(ErrorContext -> e -> ErrorCommand) -> RethrowPolicy
mkRethrowPolicy (\ErrorContext
_ctx (BlockFetchServerException
_ :: BlockFetchServerException) -> ErrorCommand
distantPeer)
    -- Some chain sync client exceptions indicate malicious behaviour,
    -- others merely mean that we should disconnect from this client
    -- because we have diverged too much.
    RethrowPolicy -> RethrowPolicy -> RethrowPolicy
forall a. Semigroup a => a -> a -> a
<> (ErrorContext -> ChainSyncClientException -> ErrorCommand)
-> RethrowPolicy
forall e.
Exception e =>
(ErrorContext -> e -> ErrorCommand) -> RethrowPolicy
mkRethrowPolicy (\ErrorContext
_ctx (ChainSyncClientException
_ :: ChainSyncClientException) -> ErrorCommand
theyBuggyOrEvil)
    -- Dispatch on nested exception
    RethrowPolicy -> RethrowPolicy -> RethrowPolicy
forall a. Semigroup a => a -> a -> a
<> (ErrorContext -> ExceptionInLinkedThread -> ErrorCommand)
-> RethrowPolicy
forall e.
Exception e =>
(ErrorContext -> e -> ErrorCommand) -> RethrowPolicy
mkRethrowPolicy
      ( \ErrorContext
ctx (ExceptionInLinkedThread String
_ SomeException
e) ->
          RethrowPolicy -> ErrorContext -> SomeException -> ErrorCommand
runRethrowPolicy (Proxy blk -> RethrowPolicy
forall blk.
(Typeable blk, StandardHash blk) =>
Proxy blk -> RethrowPolicy
consensusRethrowPolicy Proxy blk
pb) ErrorContext
ctx SomeException
e
      )
 where
  -- Shutdown the node. If we have a storage layer failure, the node /must/
  -- be restarted (triggering recovery).
  shutdownNode :: ErrorCommand
  shutdownNode :: ErrorCommand
shutdownNode = ErrorCommand
ShutdownNode

  -- Peer is either on a distant chain (one that forks more than k blocks ago)
  -- or else is just too far behind; the chain sync client doesn't really have
  -- any way of distinguishing between these two cases. If they are merely
  -- far behind, we might want to reconnect to them later.
  distantPeer :: ErrorCommand
  distantPeer :: ErrorCommand
distantPeer = ErrorCommand
ShutdownPeer

  -- The peer sent us some data that they could have known was invalid.
  -- This can only be due to a bug or malice.
  theyBuggyOrEvil :: ErrorCommand
  theyBuggyOrEvil :: ErrorCommand
theyBuggyOrEvil = ErrorCommand
ShutdownPeer

  -- Something went wrong due to a bug in our code. We disconnect from the
  -- peer, but allow to try again later in the hope the bug was transient.
  ourBug :: ErrorCommand
  ourBug :: ErrorCommand
ourBug = ErrorCommand
ShutdownPeer