Tag: terraform

  • Load data into BigQuery using Terraform

    We will load data into BigQuery using Terraform. Our task:

    • create a Google Storage bucket with two folders for holding source and target data
    • create a dataset and two tables in BigQuery
    • move data from local datasets to Google Storage and load it into the respective BigQuery tables

    We will use Google Cloud Shell, a tool for managing resources hosted on Google Cloud Platform. It provides a VM with a 5GB home directory that persists across sessions; the VM itself, however, is ephemeral and is reset approximately 20 minutes after your session ends.

    Terraform commands

    • terraform init
    • terraform plan
    • terraform apply

    Steps

    • define the required resources in a file using HCL (HashiCorp Configuration Language) format – I will name it bigquery.tf
    • create subfolders – data/source and data/target
    • place the files containing data and metadata into the respective folders
    • run terraform commands – init -> plan -> apply
    • use terraform destroy to destroy created resources

    Terraform for creating resources

    # Configure the Google Cloud provider — replace with your own project ID.
    provider "google" {
      project = "your_project_id"
    }
    
    # Random 4-byte suffix (rendered as hex, prefixed with "bi_") used as the
    # storage bucket name, since GCS bucket names must be globally unique.
    resource "random_id" "storage" {
      byte_length = 4
      prefix      = "bi_"
    }
    
    # Bucket holding both the source and target CSV files.
    # force_destroy lets `terraform destroy` remove the bucket even when it
    # still contains objects; the lifecycle rule deletes objects after 2 days.
    resource "google_storage_bucket" "bi_bucket" {
      name          = random_id.storage.hex
      location      = "europe-west2"
      force_destroy = true

      lifecycle_rule {
        action {
          type = "Delete"
        }
        condition {
          age = 2
        }
      }
    }
    
    # Upload the local source CSV into the source/ folder of the bucket.
    # Referencing the bucket resource (rather than random_id.storage.hex)
    # creates an implicit dependency, so no explicit depends_on is needed.
    resource "google_storage_bucket_object" "companieshouse_source_csv" {
      name   = "source/companies_house.csv"
      source = "./data/source/companies_house.csv"
      bucket = google_storage_bucket.bi_bucket.name
    }
    
    # Upload the local target CSV into the target/ folder of the bucket.
    # Referencing the bucket resource (rather than random_id.storage.hex)
    # creates an implicit dependency, so no explicit depends_on is needed.
    resource "google_storage_bucket_object" "companieshouse_target_csv" {
      name   = "target/companies_house.csv"
      source = "./data/target/companies_house.csv"
      bucket = google_storage_bucket.bi_bucket.name
    }
    
    # BigQuery dataset that will hold both external tables.
    # Tables in this dataset expire after 3 hours by default.
    resource "google_bigquery_dataset" "default" {
        dataset_id                  = "bidb"
        friendly_name               = "bi dataset"
        description                 = "Ingest daily companies house data"
        location                    = "EU"
        default_table_expiration_ms = 10800000 // 3 hr

        # NOTE(review): the dataset does not read these objects; this only
        # forces the CSV uploads to finish before the dataset is created.
        # It looks removable — confirm before deleting.
        depends_on = [
        google_storage_bucket_object.companieshouse_target_csv,
        google_storage_bucket_object.companieshouse_source_csv,
      ]
    }
    
    # External BigQuery table reading the target CSV directly from GCS.
    resource "google_bigquery_table" "companies_house" {
      # Referencing the dataset's dataset_id creates an implicit dependency,
      # so the previous explicit depends_on was redundant and is removed.
      dataset_id = google_bigquery_dataset.default.dataset_id
      table_id   = "companies_house"
      # Allow `terraform destroy` to drop the table without manual steps.
      deletion_protection = false

      schema = file("./data/target/companies_house.json")

      external_data_configuration {
        # NOTE(review): an explicit schema is supplied above; with
        # autodetect = true BigQuery may ignore it — confirm, and consider
        # autodetect = false if the JSON schema should be authoritative.
        autodetect    = true
        source_format = "CSV"

        csv_options {
          quote = "\""
          skip_leading_rows = 1
        }

        source_uris = [
          "${google_storage_bucket.bi_bucket.url}/target/companies_house.csv",
        ]
      }
    }
    
    # External BigQuery staging table reading the source CSV directly from GCS.
    resource "google_bigquery_table" "companies_house_stg" {
      # Referencing the dataset's dataset_id creates an implicit dependency,
      # so the previous explicit depends_on was redundant and is removed.
      dataset_id = google_bigquery_dataset.default.dataset_id
      table_id   = "companies_house_stg"
      # Allow `terraform destroy` to drop the table without manual steps.
      deletion_protection = false

      schema = file("./data/source/companies_house.json")

      external_data_configuration {
        # NOTE(review): an explicit schema is supplied above; with
        # autodetect = true BigQuery may ignore it — confirm, and consider
        # autodetect = false if the JSON schema should be authoritative.
        autodetect    = true
        source_format = "CSV"

        csv_options {
          quote = "\""
          skip_leading_rows = 1
        }

        source_uris = [
          "${google_storage_bucket.bi_bucket.url}/source/companies_house.csv",
        ]
      }
    }

    Metadata

    [
        {
            "name": "companynumber",
            "type": "STRING",
            "mode": "NULLABLE"
        },
        {
            "name": "companyname",
            "type": "STRING",
            "mode": "NULLABLE"
        },
        {
            "name": "legalstatus",
            "type": "STRING",
            "mode": "NULLABLE"
        },
        {
            "name": "incorporationdate",
            "type": "STRING",
            "mode": "NULLABLE"
        },
        {
            "name": "dissolutiondate",
            "type": "STRING",
            "mode": "NULLABLE"
        }
    ]

    Data file

    Source data
    companynumber,companyname,legalstatus,incorporationdate,dissolutiondate
    c1,"BigData Company",1,"2019-01-23",
    c2,"Data Ltd",1,"2018-01-30",
    c3,"Hello Data",1,"2017-02-21","2023-05-19"
    
    Target data
    companynumber,companyname,legalstatus,incorporationdate,dissolutiondate
    c1,"BigData Company",1,"2019-01-23",
    c3,"Hello Data",1,"2017-02-21",
    

    SQL for comparing datasets

    -- Returns distinct rows present in the target table (companies_house)
    -- but absent from the staging/source table (companies_house_stg).
    SELECT 
    companynumber,companyname,legalstatus,incorporationdate,dissolutiondate
    FROM `bidb.companies_house`
    EXCEPT DISTINCT
    SELECT 
    companynumber,companyname,legalstatus,incorporationdate,dissolutiondate
    FROM `bidb.companies_house_stg`;

    Screenshots

    Cloud Shell Terraform

    Destroy resources – terraform destroy