  • Save Google Sheets data into BigQuery and MySQL

    Read data from Google Sheets and save a worksheet into BigQuery and MySQL.

    Steps:

    1. Upload the service key file into Google Colab (see the sketch after this list)
    2. Use gspread to extract data from the worksheet
    3. Save the result as a pandas DataFrame
    4. Create a PySpark DataFrame
    5. Write the DataFrame into MySQL and BigQuery
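
    For step 1, the key can be uploaded interactively with the Colab files helper and renamed to match the file name used later in this article. This is a minimal sketch; the target name service.json is simply the path the authentication code below expects.

    from google.colab import files
    import os

    # Opens a file picker in the notebook and returns {filename: content}
    uploaded = files.upload()

    # Rename the uploaded key so the rest of the code can open "service.json"
    key_name = next(iter(uploaded))
    if key_name != "service.json":
        os.rename(key_name, "service.json")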

    Writing data into MySQL

    See the previous article, Google CoLab and relational database, for details of the Spark context and how to get data from a MySQL database.

    from pyspark.sql import SparkSession

    # GOOGLE_APPLICATION_CREDENTIALS holds the path to the service account key
    # and sparkjars is a list of connector JAR paths; both must be defined
    # before building the session (see the previous article).
    spark = SparkSession.builder \
      .appName('BigQuery')\
      .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") \
      .config("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
      .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", GOOGLE_APPLICATION_CREDENTIALS) \
      .config("spark.jars", ",".join(sparkjars))\
      .enableHiveSupport()\
      .getOrCreate()
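
    The worked example below only shows the BigQuery write, so here is a minimal sketch of step 5 for MySQL using Spark's JDBC writer. It assumes the invoice DataFrame created in the BigQuery section below, that the MySQL Connector/J JAR is included in the sparkjars list, and that the connection URL, table name and credentials are placeholders to replace with your own.

    # Hypothetical connection details - replace with your own MySQL instance
    mysql_url = "jdbc:mysql://localhost:3306/ellodb"

    (invoice.write
        .format("jdbc")
        .option("url", mysql_url)
        .option("driver", "com.mysql.cj.jdbc.Driver")
        .option("dbtable", "invoice")
        .option("user", "mysql_user")          # placeholder
        .option("password", "mysql_password")  # placeholder
        .mode("append")                        # append rows on repeated runs
        .save())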

    Getting data from Google Sheets

    import pandas as pd
    import gspread
    import io
     
    from pydrive.auth import GoogleAuth
    from pydrive.drive import GoogleDrive
    from oauth2client.service_account import ServiceAccountCredentials
    from google.colab import files 
    
    # upload the service key and rename/copy it as "service.json" in Google CoLab
    
    # Init Google (with Auth)
    file_path = r"service.json" 
    scope = ['https://spreadsheets.google.com/feeds',
             'https://www.googleapis.com/auth/drive']
     
    credentials = ServiceAccountCredentials.from_json_keyfile_name(file_path, scope)
     
    # Used to auth Google Sheets
    gc = gspread.authorize(credentials)
     
    # Used to get all files in Google Drive folder
    gauth = GoogleAuth()
    gauth.credentials = credentials
    drive = GoogleDrive(gauth)
    
    # open spreadsheet and extract data
    sheet_id = '12LXnqs5vwoVj7rDjmlntO_V4LAeEGG5wM-whjEnzgo4'
    wb = gc.open_by_key(sheet_id)
    sheet1 = wb.worksheet('A4. Base Data') # Enter your sheet name.
    original_df = sheet1.get('A2:J2803335') # list of rows (each cell returned as text)
    df = pd.DataFrame(original_df,
                      columns=['DriverNo', 'Make', 'Model', 'Description',
                               'RegisteredDate', 'EngineCC', 'Fuel', 'Co2',
                               'FinanceCost', 'FinanceHouseName'])
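
    Because gspread returns every cell as text, all of the DataFrame columns arrive as strings, and the PySpark DataFrame created below would inherit an all-string schema. Here is a minimal sketch of converting the likely numeric and date columns first; which columns to convert is an assumption based on the column names, so adjust it to your sheet.

    # Convert text columns to proper dtypes before handing the frame to Spark
    numeric_cols = ['EngineCC', 'Co2', 'FinanceCost']
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df['RegisteredDate'] = pd.to_datetime(df['RegisteredDate'], errors='coerce')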

    Save dataframe to BigQuery

    # Create PySpark DataFrame from the pandas DataFrame
    invoice = spark.createDataFrame(df)
    invoice.printSchema()
    invoice.show()
    
    # Saving data into BigQuery
    # bucket_name is a placeholder for your own GCS bucket used for temporary staging
    invoice.write \
      .format("bigquery") \
      .option("temporaryGcsBucket", "bucket_name/temp/") \
      .option("parentProject", "quickstart-25853678031891") \
      .option("dataset", "ellodb") \
      .save("quickstart-25853678031891.ellodb.invoice")
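
    Spark's default save mode raises an error if the target table already exists, so on repeated runs an explicit mode is needed. A minimal sketch, assuming new rows should be appended to the same table:

    # Append to the existing table instead of failing when it already exists
    invoice.write \
      .format("bigquery") \
      .option("temporaryGcsBucket", "bucket_name/temp/") \
      .option("parentProject", "quickstart-25853678031891") \
      .mode("append") \
      .save("quickstart-25853678031891.ellodb.invoice")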

    Read data from BigQuery

    # read data from BigQuery
    stmt = """select * from `quickstart-25853678031891.ellodb.invoice`"""
    df = (spark.read.format("bigquery")
          .option("parentProject", "quickstart-25853678031891")
          .option("viewsEnabled", "true")                 # required when reading the result of a query
          .option("materializationDataset", "ellodb_tmp") # temporary dataset for the query result
          .option("query", stmt)
          .load()
         )
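
    If the whole table is needed rather than an arbitrary SQL query, the connector can also load it directly, which avoids the need for a materialization dataset. A minimal sketch:

    # Read the whole invoice table directly
    invoice_df = (spark.read.format("bigquery")
                  .option("parentProject", "quickstart-25853678031891")
                  .load("quickstart-25853678031891.ellodb.invoice"))
    invoice_df.show(5)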

    Ref: Google CoLab