After working with Jupyter notebooks for a while, I noticed the same annoying boilerplate migrating from notebook to notebook:
```python
import pandas as pd
import numpy as np
import seaborn as sns
from pyspark.sql import SparkSession

clickhouse = create_engine(…)
spark = SparkSession.builder.appName(app_name).getOrCreate()
minio = boto3.client("s3")
mlflow.set_registry_uri("databricks-uc")
mlflow_client = MlflowClient()
```
and it goes on and on.
The solution is to structure application creation around an application factory and move the connections to external resources into the application instance.
```python
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import boto3
from pyspark.sql import SparkSession
import mlflow
from mlflow.tracking import MlflowClient

from data_sources.core.connection import DruidConnection
from data_sources.core.connection import ClickhouseEngine


class App:
    spark: SparkSession
    druid: DruidConnection
    minio: object
    mlflow: MlflowClient
    ch: ClickhouseEngine
    home: Path

    def current_user(self):
        return self.spark.sql("SELECT current_user()").toPandas().iloc[0, 0]


def create_app(app_name="default_app"):
    app = App()
    app.spark = SparkSession.builder.appName(app_name).getOrCreate()
    app.druid = DruidConnection()
    app.minio = boto3.client("s3")
    mlflow.set_registry_uri("databricks-uc")
    app.mlflow = MlflowClient()
    app.ch = ClickhouseEngine("ch-main")
    app.ch.init_app(app)
    app.home = Path("/Workspace") / "Users" / app.current_user()
    # yes, these goodies belong here
    pd.options.display.float_format = '{:.2f}'.format
    return app
```
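DruidConnection and ClickhouseEngine above are internal helpers rather than public packages. If you don't have such wrappers, a minimal sketch of an init_app-style ClickHouse engine might look like the following; the environment-variable naming and the clickhouse+native URL (which needs the clickhouse-sqlalchemy dialect installed) are assumptions for illustration, not part of the original setup:

```python
import os

import pandas as pd
from sqlalchemy import create_engine


class ClickhouseEngine:
    """Hypothetical init_app-style wrapper around a SQLAlchemy engine."""

    def __init__(self, name):
        self.name = name      # e.g. "ch-main"
        self.engine = None

    def init_app(self, app):
        # Assumption: the connection URL lives in an env var derived from the name,
        # e.g. CH_MAIN_URL="clickhouse+native://user:pass@host:9000/db"
        env_var = self.name.upper().replace("-", "_") + "_URL"
        self.engine = create_engine(os.environ[env_var])
        return self

    def read_sql(self, query):
        # Convenience so notebooks can do app.ch.read_sql("SELECT ...")
        return pd.read_sql(query, self.engine)
```

The init_app call follows the familiar Flask extension convention: the wrapper is created cheaply up front and only binds to real resources once the App instance exists.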
Then in Databricks notebooks:
```
%run app.py
app = create_app()
```
or in Jupyter notebooks:
```python
from app import *

app = create_app()
```
As a result, all imports from app.py are available in the current notebook, and a single line initializes all connections at once.
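To see the payoff, the first cell of an analysis notebook could look like this; the app name, table, bucket, and column are made up for illustration:

```python
app = create_app("churn_analysis")

# Spark, ClickHouse, S3 and MLflow are ready without any further setup
df = app.spark.table("analytics.events").limit(1000).toPandas()        # hypothetical table
sns.histplot(df["duration"])                                           # sns came along with `from app import *`
app.minio.upload_file("report.html", "reports", "churn/report.html")   # hypothetical bucket and key
print(app.home)   # /Workspace/Users/<current user>
```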