Python tools — case 1
1. Preparing a sheet
2. Creating a new Python tool
import logging
import pandas as pd
import time
import datetime
from kywy.client.kawa_decorators import kawa_tool
# Module-level logger; the 'script-logger' name is what the KAWA runtime surfaces.
log = logging.getLogger('script-logger')
# Days of history loaded for previews / incremental appends (keeps them fast).
NUM_DAYS_INCREMENT=3
# Days of history loaded for a full (re)load of the data source.
NUM_DAYS_FULL=300
@kawa_tool(
    outputs={
        'key': str,
        'date': datetime.date,
        'avgPrice': float,
        'totalQuantity': float,
    },
    parameters={
        'source_sheet_id': {'type': str},
    })
def sync_data(kawa, source_sheet_id, data_preview=False, append=False):
    """Aggregate a KAWA sheet into one row per key and return it as a dataframe.

    For each ``key`` in the source sheet, computes the first ``date``, the
    average ``price`` (as ``avgPrice``) and the total ``quantity`` (as
    ``totalQuantity``) over a recent date window, matching the columns
    declared in the ``@kawa_tool`` ``outputs``.

    :param kawa: KAWA client injected by the runtime (provides sheet/query API).
    :param source_sheet_id: id of the sheet to read from.
    :param data_preview: True when the tool runs in preview mode; only a
        short window of history is loaded.
    :param append: True for incremental runs; also uses the short window.
    :return: pandas DataFrame with columns key, date, avgPrice, totalQuantity.
    """
    start = time.time()
    log.info('Python etl example with computation')

    # Previews and incremental appends only need a few days of history;
    # a full load pulls the complete configured range. Limiting the window
    # keeps the warehouse query (and this script) scalable.
    num_days = NUM_DAYS_INCREMENT if data_preview or append else NUM_DAYS_FULL
    from_date = datetime.date.today() - datetime.timedelta(days=num_days)

    #
    # This step loads data from the data warehouse.
    # Use filters and group_by to limit as much as possible the
    # data that will be processed by your script.
    # This ensures that your process scales.
    #
    # NOTE: lazy %-style args so the message is only formatted if emitted.
    log.info('Loading data from:%s', from_date)
    query = (kawa
             .sheet(sheet_id=source_sheet_id)
             .select(
                 # Adjust to the names of the columns of your sheet
                 # (Make sure to adjust the filter as well)
                 kawa.col('key').first().alias('key'),
                 kawa.col('date').first().alias('date'),
                 kawa.col('price').avg().alias('avgPrice'),
                 kawa.col('quantity').sum().alias('totalQuantity'),
             )
             .filter(kawa.col('date').date_range(from_inclusive=from_date))
             #
             # Feel free to add stages to your query
             #
             .group_by('key')
             .no_limit())

    #
    # From this point on, you work with a regular pandas dataframe.
    # Feel free to add any more advanced transformation on it
    #
    df = query.compute()

    #
    # The important thing is to return a dataframe with the columns
    # declared in the output section of the @kawa_tool decorator.
    #
    elapsed = time.time() - start
    log.info('Your transformation was performed in %ss', elapsed)
    return df
3. Create your data source


4. Configure scheduling


Last updated
Was this helpful?

