import sys
import datetime
import re
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
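# Standard Glue boilerplate: build a GlueContext on top of the SparkContext
# and wrap it in a Job so we can call init()/commit() for bookkeeping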
glueContext = GlueContext(SparkContext.getOrCreate())
job = Job(glueContext)
## DON'T FORGET TO PUT IN YOUR INPUT AND OUTPUT LOCATIONS BELOW.
your_database_name = "yourname-datalake-db"
your_table_name = "reviews"
output_location = "s3://yourname-0000-datalake/parquet/reviews/"
job.init("byod-workshop" + str(datetime.datetime.now().timestamp()))
#load our data from the catalog that we created with a crawler
dynamicF = glueContext.create_dynamic_frame.from_catalog(
    database=your_database_name,
    table_name=your_table_name,
    transformation_ctx="dynamicF")
# Spark's Parquet writer rejects column names containing " ,;{}()\n\t=",
# so replace any of those characters with _
df = dynamicF.toDF()
def canonical(x):
    return re.sub(r"[ ,;{}()\n\t=]+", "_", x.lower())
renamed_cols = [canonical(c) for c in df.columns]
df = df.toDF(*renamed_cols)
# write our dataframe in parquet format to an output s3 bucket
df.write.mode("overwrite").format("parquet").save(output_location)
job.commit()
In this step we created a Glue job that converts the reviews table from CSV to Parquet. Next, we will create a crawler to discover the schema of the Parquet dataset we just wrote and store its metadata in the Glue Data Catalog.
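You will typically create that crawler in the Glue console, but as a rough illustration the same step can be scripted with boto3. This is a minimal sketch, not part of the workshop job: the crawler name, IAM role, and table prefix below are placeholder assumptions you would replace with your own values.

```python
import boto3

glue = boto3.client("glue")  # assumes credentials and region are already configured

# Placeholder names -- substitute your own role, database, and S3 path
glue.create_crawler(
    Name="yourname-parquet-reviews-crawler",
    Role="AWSGlueServiceRole-yourname",   # IAM role the crawler assumes (hypothetical)
    DatabaseName="yourname-datalake-db",  # same database used in the job above
    TablePrefix="parquet_",               # keeps the new table distinct from the CSV one
    Targets={"S3Targets": [{"Path": "s3://yourname-0000-datalake/parquet/reviews/"}]},
)

# Run the crawler so the Parquet schema lands in the Glue Data Catalog
glue.start_crawler(Name="yourname-parquet-reviews-crawler")
```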