After working with Jupyter notebooks for a while, I kept running into the same annoying boilerplate migrating from notebook to notebook:
import pandas as pd
import numpy as np
import seaborn as sns
import boto3
import mlflow
from mlflow.tracking import MlflowClient
from sqlalchemy import create_engine
from pyspark.sql import SparkSession

clickhouse = create_engine(…)
spark = SparkSession.builder.appName(app_name).getOrCreate()
minio = boto3.client("s3")
mlflow.set_registry_uri("databricks-uc")
mlflow_client = MlflowClient()

and it goes on and on.
The solution is to structure application creation with an application factory and to extract the connections to external resources into a single application instance.
import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path
import boto3
from pyspark.sql import SparkSession
import mlflow
from mlflow.tracking import MlflowClient
from data_sources.core.connection import DruidConnection
from data_sources.core.connection import ClickhouseEngine

class App:
    # One object that owns every external connection used across notebooks
    spark: SparkSession
    druid: DruidConnection
    minio: object
    mlflow: MlflowClient
    ch: ClickhouseEngine
    home: Path

    def current_user(self):
        # Resolve the current workspace user, used below to build per-user paths
        return self.spark.sql("SELECT current_user()").toPandas().iloc[0, 0]

def create_app(app_name="default_app"):
    app = App()
    app.spark = SparkSession.builder.appName(app_name).getOrCreate()
    app.druid = DruidConnection()
    app.minio = boto3.client("s3")
    mlflow.set_registry_uri("databricks-uc")
    app.mlflow = MlflowClient()
    app.ch = ClickhouseEngine("ch-main")
    app.ch.init_app(app)
    app.home = Path("/Workspace") / "Users" / app.current_user()
    # yes, these goodies belong here
    pd.options.display.float_format = '{:.2f}'.format
    return app
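
The ClickhouseEngine("ch-main") plus init_app(app) pair follows the same extension pattern Flask uses: construct the resource first, then bind it to the app instance. data_sources.core.connection is an internal module, so the snippet below is only a sketch of that pattern; the connection URL and attribute names are assumptions, not the real implementation:

from sqlalchemy import create_engine

class ClickhouseEngine:
    # Illustrative sketch only; the real class lives in data_sources.core.connection
    def __init__(self, name):
        self.name = name      # logical connection name, e.g. "ch-main"
        self.engine = None

    def init_app(self, app):
        # Bind the engine once the app exists; the URL here is a placeholder
        self.engine = create_engine(f"clickhouse+native://localhost/{self.name}")
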
Then in Databricks notebooks:

%run app.py
app = create_app()
or in Jupyter notebooks:
from app import *
app = create_app()
As a result, all imports from app.py become available in the current notebook, and a single line initializes all connections at once.
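
For example, once the app is created, both the shared imports and the live connections are ready to use; the table, column, and bucket names below are just placeholders:

app = create_app("churn_report")

# pd, np and sns were imported in app.py, so they are already in scope
events = app.spark.sql("SELECT user_id, duration FROM events LIMIT 100").toPandas()
sns.histplot(events["duration"])

# the same object carries every external connection
app.minio.download_file("reports-bucket", "model.pkl", str(app.home / "model.pkl"))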