After working with Jupyter notebooks for a while, I noticed the same annoying boilerplate migrating from notebook to notebook:
```python
import pandas as pd
import numpy as np
import seaborn as sns
from pyspark.sql import SparkSession

clickhouse = create_engine(…)
spark = SparkSession.builder.appName(app_name).getOrCreate()
minio = boto3.client("s3")
mlflow.set_registry_uri("databricks-uc")
mlflow_client = MlflowClient()
```
and it goes on and on.
The solution is to structure application creation around an application factory and move the connections to external resources into the application instance.
```python
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import boto3
from pyspark.sql import SparkSession
import mlflow
from mlflow.tracking import MlflowClient

from data_sources.core.connection import DruidConnection
from data_sources.core.connection import ClickhouseEngine


class App:
    spark: SparkSession
    druid: DruidConnection
    minio: object
    mlflow: MlflowClient
    ch: ClickhouseEngine
    home: Path

    def current_user(self):
        return self.spark.sql("SELECT current_user()").toPandas().iloc[0, 0]


def create_app(app_name="default_app"):
    app = App()
    app.spark = SparkSession.builder.appName(app_name).getOrCreate()
    app.druid = DruidConnection()
    app.minio = boto3.client("s3")
    mlflow.set_registry_uri("databricks-uc")
    app.mlflow = MlflowClient()
    app.ch = ClickhouseEngine("ch-main")
    app.ch.init_app(app)
    app.home = Path("/Workspace") / "Users" / app.current_user()
    # yes, these goodies belong here
    pd.options.display.float_format = '{:.2f}'.format
    return app
```
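DruidConnection and ClickhouseEngine above are internal helpers rather than public packages. If you don't have such wrappers, a minimal sketch of an init_app-style ClickHouse engine might look like the following; the environment-variable naming and the clickhouse+native URL (which needs the clickhouse-sqlalchemy dialect installed) are assumptions for illustration, not part of the original setup:

```python
import os

import pandas as pd
from sqlalchemy import create_engine


class ClickhouseEngine:
    """Hypothetical init_app-style wrapper around a SQLAlchemy engine."""

    def __init__(self, name):
        self.name = name      # e.g. "ch-main"
        self.engine = None

    def init_app(self, app):
        # Assumption: the connection URL lives in an env var derived from the name,
        # e.g. CH_MAIN_URL="clickhouse+native://user:pass@host:9000/db"
        env_var = self.name.upper().replace("-", "_") + "_URL"
        self.engine = create_engine(os.environ[env_var])
        return self

    def read_sql(self, query):
        # Convenience so notebooks can do app.ch.read_sql("SELECT ...")
        return pd.read_sql(query, self.engine)
```

The init_app call follows the familiar Flask extension convention: the wrapper is created cheaply up front and only binds to real resources once the App instance exists.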
Then in Databricks notebooks:
```
%run app.py
app = create_app()
```
or in Jupyter notebooks:
```python
from app import *

app = create_app()
```
As a result, all imports from app.py are available in the current notebook, and a single line initializes all connections at once.
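To see the payoff, the first cell of an analysis notebook could look like this; the app name, table, bucket, and column are made up for illustration:

```python
app = create_app("churn_analysis")

# Spark, ClickHouse, S3 and MLflow are ready without any further setup
df = app.spark.table("analytics.events").limit(1000).toPandas()        # hypothetical table
sns.histplot(df["duration"])                                           # sns came along with `from app import *`
app.minio.upload_file("report.html", "reports", "churn/report.html")   # hypothetical bucket and key
print(app.home)   # /Workspace/Users/<current user>
```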