After working with Jupyter notebooks for a while, I got tired of the same annoying boilerplate that migrated from notebook to notebook:
```python
import pandas as pd
import numpy as np
import seaborn as sns
import boto3
import mlflow
from mlflow.tracking import MlflowClient
from sqlalchemy import create_engine
from pyspark.sql import SparkSession

clickhouse = create_engine(…)
spark = SparkSession.builder.appName(app_name).getOrCreate()
minio = boto3.client("s3")
mlflow.set_registry_uri("databricks-uc")
mlflow_client = MlflowClient()
```
…and it goes on and on.
The solution is to structure application creation with an application factory and move the connections to external resources into a single application instance. Here is app.py:
```python
from dataclasses import dataclass
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import boto3
import mlflow
from mlflow.tracking import MlflowClient
from pyspark.sql import SparkSession

from data_sources.core.connection import DruidConnection
from data_sources.core.connection import ClickhouseEngine


@dataclass
class App:
    spark: SparkSession = None
    druid: DruidConnection = None
    minio: object = None
    mlflow: MlflowClient = None
    ch: ClickhouseEngine = None
    home: Path = None

    def current_user(self):
        return self.spark.sql("SELECT current_user()").toPandas().iloc[0, 0]


def create_app(app_name="default_app"):
    app = App()
    app.spark = SparkSession.builder.appName(app_name).getOrCreate()
    app.druid = DruidConnection()
    app.minio = boto3.client("s3")
    mlflow.set_registry_uri("databricks-uc")
    app.mlflow = MlflowClient()
    app.ch = ClickhouseEngine("ch-main")
    app.ch.init_app(app)
    app.home = Path("/Workspace") / "Users" / app.current_user()
    # yes, these goodies belong here
    pd.options.display.float_format = '{:.2f}'.format
    return app
```
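The `ClickhouseEngine` follows the Flask-extension style `init_app` pattern: it is constructed with a profile name and only connects once it receives the application instance. The real class lives in an internal `data_sources` package, so purely as an illustration of the pattern, here is a hypothetical sketch of such a wrapper; the class name, the DSN lookup, and the connection string are made up (and the `clickhouse+native` URL assumes the clickhouse-sqlalchemy dialect is installed):

```python
from sqlalchemy import create_engine

# Hypothetical mapping from profile name to DSN; in reality this would come
# from configuration or a secrets store.
DSNS = {"ch-main": "clickhouse+native://user:pass@ch-host:9000/default"}


class LazyClickhouseEngine:
    """Hypothetical stand-in for the internal ClickhouseEngine wrapper."""

    def __init__(self, profile: str):
        # Only remember the connection profile; nothing connects yet.
        self.profile = profile
        self.engine = None

    def init_app(self, app):
        # Late binding: the engine is created once the application instance
        # exists, so all connection setup happens inside create_app().
        self.engine = create_engine(DSNS[self.profile])
```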
Then, in Databricks notebooks:
```python
%run app.py
app = create_app()
```
Or in Jupyter notebooks:
```python
from app import *
app = create_app()
```
As a result, all the imports from app.py are available in the current notebook, and a single line initializes all the connections at once.
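For example, a notebook might then look like this; the table name, bucket, and file name below are made up for illustration:

```python
app = create_app("weekly_report")

# pd, np, sns, etc. are already in scope thanks to the star import from app.py
events = app.spark.sql("SELECT * FROM events LIMIT 100").toPandas()
events.describe()

# persist an artifact next to the user's workspace files
out_path = app.home / "events_sample.parquet"
events.to_parquet(out_path)

# and push it to object storage via the shared boto3 client
app.minio.upload_file(str(out_path), "reports", "events_sample.parquet")
```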