Delta Lake and Athena external tables

# Read raw landing-zone JSON into Spark, stamp each row with an ETL timestamp,
# then write the filtered subset out as a Delta table in the gold layer.
import datetime

from delta.tables import *
from pyspark.sql.functions import lit

# s3a:// paths for the medallion layers.
# NOTE(review): `bucket` and `s3_file_path` are assumed to be defined upstream
# (e.g. notebook widgets/job parameters) — confirm before running standalone.
sourced_landing_file = f"s3a://{bucket}/{s3_file_path}"
silver_file = f"s3a://{bucket}/stg/{s3_file_path}"
gold_file = f"s3a://{bucket}/lake/{s3_file_path}"

# ISO-8601 UTC batch timestamp with microsecond precision,
# e.g. "2023-01-01T12:34:56.000123Z".
etl_ts_utc = f'{datetime.datetime.utcnow().isoformat(timespec="microseconds")}Z'

# Load the raw JSON and tag every record with the batch timestamp.
df_input_raw = (
    spark.read.json(sourced_landing_file)
    .withColumn("etl_ts_utc", lit(etl_ts_utc))
)
df_input_raw.count()  # force a read now so ingest errors surface early

df_input_raw.createOrReplaceTempView('my_table')
spark.sql("""SELECT * FROM my_table""").show()

# Select only the slice of interest; "<etc>" is the article's placeholder for
# the remaining column list.
my_query = """
select id,name,createdAt, <etc> from my_table
where someId is null
"""
part_of_data_df = spark.sql(my_query)
part_of_data_df.show()
part_of_data_df.count()
part_of_data_df.printSchema()

# Write to the gold-layer path defined above. The target must be empty or
# already a Delta table — writing Delta on top of plain Parquet/other files
# raises "Incompatible format detected" (see the error pasted below).
part_of_data_df.write.format("delta").save(gold_file)
Py4JJavaError: An error occurred while calling o1006.save. : org.apache.spark.sql.AnalysisException: Incompatible format detected.  You are trying to write to `s3a://<s3-bucket>/<s3-path>/` using Databricks Delta, but there is no transaction log present. Check the upstream job to make sure that it is writing using format("delta") and that you are trying to write to the table base path.
%sqlGENERATE symlink_format_manifest FOR TABLE delta.`s3a://<data-lake-file-path>`
spark.databricks.hive.metastore.glueCatalog.enabled true
%sqluse `<database-name>`
%sql
-- External table over the Delta symlink manifest so Athena/Glue can query it.
-- NOTE: the partition column must NOT also appear in the column list below —
-- repeating it triggers "Found duplicate column(s) in the table definition"
-- (the error pasted after this statement).
CREATE EXTERNAL TABLE `<table-name>` (
id string,
name string,
createdAt string,
processedAt string,
updatedAt string,
email string,
subtotalLineItemsQuantity BIGINT,
customer struct<`firstName`:string,id:string,lastName:string,note:string,ordersCount:string>,
cancelReason string,
cancelledAt string,
displayFinancialStatus string,
displayFulfillmentStatus string,
originalTotalPriceSet struct <`shopMoney`:struct<`amount`:string,currencyCode:string>>,
netPaymentSet struct <`shopMoney`:struct<`amount`:string,currencyCode:string>>,
totalTaxSet struct <`shopMoney`:struct<`amount`:string,currencyCode:string>>,
refunds array<struct<`createdAt`:string,id:string,note:string,totalRefundedSet:struct <`shopMoney`:struct<`amount`:string, currencyCode:string>>,updatedAt:string>>,
transactions array<struct<`accountNumber`:string,gateway:string,kind:string,processedAt:string,status:string>>,
tags array<string>,
note string,
customAttributes array<struct<`key`:string,value:string>>,
billingAddress struct<`city`:string,country:string,provinceCode:string,zip:string>,
shippingAddress struct<`city`:string,country:string,provinceCode:string,zip:string>
)
PARTITIONED BY (<partition-field-name> string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://s3-path-to-save-delta-files/_symlink_format_manifest/'
Error in SQL statement: AnalysisException: Found duplicate column(s) in the table definition of `database_name`.`table_name`: `field_name`;
spark.sql(f'MSCK REPAIR TABLE `{database-name}`.`{table-name}`')

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store