ArchiveBox Architecture Diagrams

High-Level System Execution Flow

stateDiagram-v2
    archivebox.cli.main(sys.argv)
    state Supervisord {
        Scheduler
        state Orchestrator {
            [*] --> TICK
            TICK --> SPAWN_ACTORS: queued > 0
            SPAWN_ACTORS --> TICK
            TICK --> IDLE: queued == 0
            IDLE --> TICK: 1s
        }
    }

    note left of archivebox.cli.main(sys.argv)
        archivebox entrypoint
    end note

    state "archivebox.cli.SUBCOMMAND" as MAIN_THREAD

    archivebox.cli.main(sys.argv) --> run_subcommand(sys.argv)
    run_subcommand(sys.argv) --> setup_django()
    setup_django() --> Supervisord: spawns in background
    setup_django() --> MAIN_THREAD: runs in foreground   

    MAIN_THREAD --> archivebox.main.SUBCOMMAND
    archivebox.main.SUBCOMMAND --> Storage: add_to_queue()

    state Actors {
        CrawlActor --> Crawl: tick()
        SnapshotActor --> Snapshot: tick()
        ArchiveResultActors --> ArchiveResult: tick()
    }

    state "State Machines" as JOBS {

        state Crawl {
            state "QUEUED" as CRAWL_QUEUED
            state "STARTED" as CRAWL_STARTED
            state "SEALED" as CRAWL_SEALED
            CRAWL_QUEUED --> CRAWL_STARTED: create_root_snapshot()
            CRAWL_STARTED --> CRAWL_SEALED: is_finished
        }

        state Snapshot {
            state "QUEUED" as SNAP_QUEUED
            state "STARTED" as SNAP_STARTED
            state "SEALED" as SNAP_SEALED
            SNAP_QUEUED --> SNAP_STARTED: create_pending_archiveresults()
            SNAP_STARTED --> SNAP_SEALED: is_finished
        }

        state ArchiveResult {
            QUEUED --> STARTED: run_extractor()
            STARTED --> BACKOFF: is_temp_error
            BACKOFF --> STARTED: is_retry_past
            STARTED --> FAILED: is_fatal_error
            STARTED --> SUCCEEDED: is_succeeded
        }
        
        
        note right of ArchiveResult
            exec_chrome()
            exec_wget()
            exec_curl()
            ... other extractor subprocesses ...
        end note
    }
    
    state Storage {
        state "DB" as SQLITE_DB
        sources/
        archive/
        state "index.json" as INDEX_JSONS
    }

    Storage: Storage

    Orchestrator --> Actors: spawns subprocesses
    
    Crawl --> Snapshot: create_root_snapshot()
    Snapshot --> ArchiveResult: create_pending_archiveresults()
    
    Crawl --> Storage: .save()
    Snapshot --> Storage: .save()
    ArchiveResult --> Storage: .save()
    
    Storage --> Actors: get_queue()


State Diagrams for Main Models

Crawl

  • crawls/models.py: Crawl

  • crawls/statemachines.py: CrawlMachine
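
The Crawl machine in the diagram translates almost directly into a small state machine class. The sketch below is a hypothetical rendering with the python-statemachine package (which CrawlMachine is assumed to build on); the snapshots list and callback bodies are placeholders, not the real Django model logic.

```python
from statemachine import State, StateMachine


class CrawlMachineSketch(StateMachine):
    queued = State(initial=True)
    started = State()
    sealed = State(final=True)

    start = queued.to(started)                      # fired by the CrawlActor's tick()
    seal = started.to(sealed, cond="is_finished")

    def __init__(self):
        self.snapshots = []                         # stand-in for the crawl's Snapshot set
        super().__init__()

    def on_start(self):
        # the real machine calls create_root_snapshot() at this point
        self.snapshots.append("queued")

    def is_finished(self):
        # a crawl can be sealed once every one of its snapshots has finished
        return bool(self.snapshots) and all(s == "sealed" for s in self.snapshots)
```

Calling start() moves the crawl to STARTED and creates its root snapshot; seal() is only allowed once is_finished returns True, mirroring the QUEUED → STARTED → SEALED flow above.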

Snapshot

  • core/models.py: Snapshot

  • core/statemachines.py: SnapshotMachine
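
SnapshotMachine mirrors the Crawl machine above: QUEUED → STARTED when create_pending_archiveresults() runs, then SEALED once every ArchiveResult has finished. Instead of repeating that, the sketch below shows the other half of the diagram: how an actor drives such a machine by polling Storage with a get_queue()-style call and ticking each queued object. The names here are hypothetical stand-ins, not ArchiveBox's actor API.

```python
import time


def snapshot_actor_loop(get_queue, poll_interval=1.0, max_idle_ticks=30):
    """Claim queued Snapshots and advance their state machines until idle."""
    idle_ticks = 0
    while idle_ticks < max_idle_ticks:     # exit once there has been no work for a while
        batch = get_queue()                # queued Snapshot-like objects claimed from the DB
        if not batch:
            idle_ticks += 1
            time.sleep(poll_interval)
            continue
        idle_ticks = 0
        for snapshot in batch:
            snapshot.tick()                # QUEUED -> STARTED -> SEALED via SnapshotMachine
```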

ArchiveResult

  • core/models.py: ArchiveResult

  • core/statemachines.py: ArchiveResultMachine
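
The ArchiveResult machine adds the retry logic: a temporary error parks the result in BACKOFF until its retry time passes, while a fatal error or a success is terminal. Below is a hypothetical sketch with the python-statemachine package, using a single guarded tick() event whose conditions mirror the edge labels in the diagram; the output/error/retry_at fields are placeholders, not the real model.

```python
from datetime import datetime, timedelta

from statemachine import State, StateMachine


class ArchiveResultMachineSketch(StateMachine):
    queued = State(initial=True)
    started = State()
    backoff = State()
    succeeded = State(final=True)
    failed = State(final=True)

    # one tick() event; the first transition whose condition passes wins,
    # otherwise the internal self-transitions keep the current state
    tick = (
        queued.to(started)                           # run_extractor()
        | started.to(succeeded, cond="is_succeeded")
        | started.to(failed, cond="is_fatal_error")
        | started.to(backoff, cond="is_temp_error")
        | backoff.to(started, cond="is_retry_past")  # retry once the backoff expires
        | started.to.itself(internal=True)           # extractor still running
        | backoff.to.itself(internal=True)           # backoff not over yet
    )

    def __init__(self):
        self.output = None        # filled in by the extractor subprocess on success
        self.error = None         # None, "temporary", or "fatal" (placeholder field)
        self.retry_at = datetime.now() + timedelta(seconds=10)
        super().__init__()

    def is_succeeded(self):
        return self.output is not None and self.error is None

    def is_temp_error(self):
        return self.error == "temporary"

    def is_fatal_error(self):
        return self.error == "fatal"

    def is_retry_past(self):
        return datetime.now() >= self.retry_at
```

In the diagram's terms, the extractor subprocess (chrome, wget, curl, ...) runs while the result sits in STARTED, and the actor keeps calling tick() until the result lands in SUCCEEDED or FAILED.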
