Configuration Reference#

All pipeline behaviour is controlled by a single YAML file. Start from an example in configs-example/ and edit it to suit your deployment.

Environment variables can be used anywhere in the config with ${VAR} or ${VAR:-default} syntax — they are resolved at load time. Never commit secrets directly to config files.

pipeline#

pipeline:
  name: "dengue"           # used as a prefix in logs and artifact paths
  title: "Dengue Intelligence"  # used in report titles and email subjects

run#

run:
  run_date: "2026-04-07"   # reference date for this run (empty = today)

storages#

Where pipeline artifacts are written. Supports filesystem and S3 backends.

storages:
  artifacts:
    kind: filesystem
    filesystem:
      base_path: "./artifacts"   # outputs go under {base_path}/{run_id}/

# S3 alternative:
storages:
  artifacts:
    kind: s3
    s3:
      bucket: my-bucket
      base_prefix: "artifacts/"
      profile: default
      region: ap-south-1

data#

case_download#

Controls how raw case data is fetched.

data:
  case_download:
    enabled: true
    source_backend: "filesystem"   # "filesystem" or "s3"
    source_path: "datasets/raw_linelist_data/KA_linelist"
    cache_enabled: true
    cache_dir: "./cache/raw_case"
    cache_strategy: "local_first"  # "local_first" | "local" | "cloud_first"

case_parse#

Controls how the raw case data is parsed into daily case series.

data:
  case_parse:
    region_types: ["district"]     # ["district"] or ["zone", "corp"] etc.
    date_start: "2015-01-01"
    date_end: ""                   # empty = run_date

case_sufficiency#

Early validation gate. The pipeline exits here if data does not meet the minimum requirements.

data:
  case_sufficiency:
    enabled: true
    min_total_rows: 30
    min_distinct_regions: 2
    min_date_span_days: 14

geojson#

Path to the folder containing GeoJSON boundary files.

data:
  geojson:
    base_path: "datasets/geojsons/geojsons_GBA"

GeoJSON files must follow the layout {base_path}/{region_type}s/{region_id}.geojson. See Input Data Specification for the full GeoJSON format.

weather_download#

Controls how weather data is fetched. Use source_mode: "filesystem" to read from a local cache, or "cds" to download fresh data from the Copernicus CDS API.

data:
  weather_download:
    enabled: true
    source_mode: "filesystem"      # "filesystem" or "cds"
    netcdf_cache_path: "./datasets/netcdf"
    parsed_output_path: "./datasets/parsednetcdf"
    region_type: "district"
    w_params: ["t2m", "d2m", "tp"]
    threshold_km: 25.0
    bounds_resolution_deg: 0.1
    start_date: "2015-01-01"
    end_date: "2024-07-01"

weather_parse#

Aggregation rules applied to the raw weather variables.

data:
  weather_parse:
    region_type: "district"
    weather_variables: ["2mTemperature", "totalPrecipitation", "2mDewpointTemperature"]
    daily_agg:
      - name: "2mTemperature"
        op: "mean"
      - name: "2mTemperature"
        op: "max"
        output_name: "2mTemperature_max"
    rolling_agg:
      - name: "2mTemperature"
        op: "mean"
    rolling_n_days: 7
    sampling_rate: 7
    intermediate_col_rename:
      "2mTemperature": "t2m_mean"
      "totalPrecipitation": "tp_sum"
      "2mDewpointTemperature": "d2m_mean"

cutoff#

Minimum number of regions required in the case and weather datasets after cutoff filtering. Runs with fewer regions than these thresholds are aborted.

cutoff:
  case_min_regions: 10
  weather_min_regions: 25

thresholds#

thresholds:
  region_type: "district"
  n_weeks: 4                 # number of recent weeks used for the "previousNweeks" method
  historical_n_years: 5      # years of history used for the "historical" method
  excluded_years: [2020, 2021]  # years excluded from threshold calculation (e.g. pandemic years)

model#

model:
  spatial_res: "district"
  data_features:
    - "case"
    - "recordDate"
    - "recordYear"
    - "recordMonth"
    - "ISOWeek"
    - "t2m_mean"
    - "tp_sum"
    - "d2m_mean"
  lag:
    lag_temp: [12]           # temperature lag in days
    lag_rf: [4]              # rainfall / humidity lag in days
  years_to_exclude: [2020, 2021]
  list_alpha: [1.0, 2.0]    # NBR alpha parameters

assess#

assess:
  total_corp_regions: 5
  total_zone_regions: 10

maps#

maps:
  output_dir: "plots"
  figure_title: "Dengue Risk Map"

report#

report:
  output_dir: "reports"
  compile_pdf: false         # set to true if pdflatex is installed

report_distribution#

Metadata embedded in the generated reports.

report_distribution:
  system_name: "Dengue Early Warning System"
  organization: "ARTPARK, IISc Bengaluru"
  state: "Karnataka"
  region: "Greater Bengaluru Area (GBA)"
  department: "Directorate of Health & Family Welfare, GoK"

email#

Run completion notifications via SMTP.

email:
  enabled: true
  on: ["success"]            # "success", "failed", or both
  from: "${SMTP_FROM}"
  to:
    - "user@example.com"
  smtp:
    host: "${SMTP_HOST}"
    port: 587
    use_tls: true
    username: "${SMTP_USERNAME}"
    password: "${SMTP_PASSWORD}"

schedule#

Used by scripts/run_schedules.py to configure recurring runs.

schedule:
  cron: "0 6 * * 1"         # every Monday at 06:00 in the configured timezone
  timezone: "Asia/Kolkata"
  pipeline: "pipelines.dengue.pipeline:build_pipeline"
  config: "configs/gba_stage1.yaml"

logging#

logging:
  level: INFO                # DEBUG, INFO, WARNING, ERROR

Full Example#

A minimal working config for a local filesystem run:

pipeline:
  name: dengue
  title: "Dengue Intelligence"

run:
  run_date: ""               # empty = today

storages:
  artifacts:
    kind: filesystem
    filesystem:
      base_path: "./artifacts"

data:
  case_download:
    enabled: true
    source_backend: filesystem
    source_path: datasets/raw_linelist_data/KA_linelist

  case_parse:
    region_types: [district]
    date_start: "2015-01-01"
    date_end: ""

  case_sufficiency:
    enabled: true
    min_total_rows: 30
    min_distinct_regions: 2
    min_date_span_days: 14

  geojson:
    base_path: datasets/geojsons

  weather_download:
    enabled: true
    source_mode: filesystem
    netcdf_cache_path: datasets/netcdf
    parsed_output_path: datasets/parsednetcdf
    region_type: district
    w_params: [t2m, d2m, tp]
    start_date: "2015-01-01"
    end_date: ""

thresholds:
  region_type: district
  n_weeks: 4
  historical_n_years: 5
  excluded_years: [2020, 2021]

model:
  spatial_res: district
  years_to_exclude: [2020, 2021]

report:
  output_dir: reports
  compile_pdf: false

email:
  enabled: false

logging:
  level: INFO