Source code for doespy.etl.steps.loaders

from abc import ABC, abstractmethod
from typing import List, Dict

import pandas as pd
import os
import matplotlib.pyplot as plt
import inspect
import sys
from typing import Optional

from pydantic import BaseModel

class Loader(BaseModel, ABC):
    class Config:
        extra = "forbid"

    def get_output_dir(self, etl_info):
        """:meta private:"""

        etl_output_dir = etl_info["etl_output_dir"]

        if etl_output_dir is not None and not os.path.exists(etl_output_dir):
            os.makedirs(etl_output_dir)

        return etl_output_dir

    @abstractmethod
    def load(self, df: pd.DataFrame, options: Dict, etl_info: Dict) -> None:
        """:meta private:"""
        # NOTE: Extending classes should not use the `options: Dict` and instead use instance variables for parameters

        pass


class PlotLoader(Loader):

    def save_data(self, df: pd.DataFrame, filename: str, output_dir: str, output_filetypes: List[str] = ["html"]):
        """:meta private:"""
        os.makedirs(output_dir, exist_ok=True)

        for ext in output_filetypes:
            if ext == "html":
                html_table = df.to_html()
                path = os.path.join(output_dir, f"{filename}.html")

                with open(path, 'w') as file:
                    file.write(html_table)
            else:
                raise ValueError(f"PlotLoader: Unknown file type {ext}")


    def save_plot(
        self,
        fig: plt.Figure,
        filename: str,
        output_dir: str,
        use_tight_layout: bool = True,
        output_filetypes: List[str] = ["pdf", "png"],
    ):
        """:meta private:"""
        os.makedirs(output_dir, exist_ok=True)


        for ext in output_filetypes:
            full_filename = f"{output_dir}/{filename}.{ext}"

            bbox_inches = "tight" if (use_tight_layout and ext == "pdf") else None
            pad_inches = 0 if (use_tight_layout and ext == "pdf") else 0.1
            dpi = 300
            fig.savefig(
                full_filename,
                format=ext,
                bbox_inches=bbox_inches,
                pad_inches=pad_inches,
                dpi=dpi,
            )

    def default_fig(self):
        """:meta private:"""
        scale_factor = 2.4
        figsize = [
            scale_factor * 1.618,
            scale_factor * 1,
        ]  # [width, height] based on golden ratio
        fig = plt.figure(figsize=figsize, dpi=100)
        plt.figure(fig.number)
        return fig


[docs]class CsvSummaryLoader(Loader): """ The `CsvSummaryLoader` creates a CSV file of the data frame from the `Transformer` stage. .. code-block:: yaml :caption: Example ETL Pipeline Design $ETL$: loaders: CsvSummaryLoader: {} # with default output dir CsvSummaryLoader: # with skip empty df skip_empty: True """ skip_empty: bool = False """Ignore empty df, if set to ``False``, raises an error if the data frame is empty.""" def load(self, df: pd.DataFrame, options: Dict, etl_info: Dict) -> None: if self.skip_empty and df.empty: return elif df.empty: raise ValueError("CsvSummaryLoader: DataFrame is empty so not creating an output file.") else: output_dir = self.get_output_dir(etl_info) df.to_csv(os.path.join(output_dir, f"{etl_info['pipeline']}.csv"))
[docs]class PickleSummaryLoader(Loader): """The `PickleSummaryLoader` creates a Pickle file of the data frame from the `Transformer` stage. .. code-block:: yaml :caption: Example ETL Pipeline Design $ETL$: loaders: PickleSummaryLoader: {} # with default output dir PickleSummaryLoader: # with skip empty dir skip_empty: True """ skip_empty: bool = False """Ignore empty df, if set to ``False``, raises an error if the data frame is empty.""" def load(self, df: pd.DataFrame, options: Dict, etl_info: Dict) -> None: if self.skip_empty and df.empty: return elif df.empty: raise ValueError("PickleSummaryLoader: DataFrame is empty so not creating an output file.") else: output_dir = self.get_output_dir(etl_info) df.to_pickle(os.path.join(output_dir, f"{etl_info['pipeline']}.pkl"))
[docs]class LatexTableLoader(Loader): """The `LatexTableLoader` creates a tex file of the data frame from the `Transformer` stage formatted as a Latex table. .. code-block:: yaml :caption: Example ETL Pipeline Design $ETL$: loaders: LatexTableLoader: {} # with default output dir LatexTableLoader: # with skip empty df skip_empty: True """ skip_empty: bool = False """Ignore empty df, if set to ``False``, raises an error if the data frame is empty.""" def load(self, df: pd.DataFrame, options: Dict, etl_info: Dict) -> None: if self.skip_empty and df.empty: return elif df.empty: raise ValueError("LatexTableLoader: DataFrame is empty so not creating an output file.") output_dir = self.get_output_dir(etl_info) with open( os.path.join(output_dir, f"{etl_info['pipeline']}.tex"), "w" ) as file: df.to_latex(buf=file)
__all__ = [name for name, cl in inspect.getmembers(sys.modules[__name__], inspect.isclass) if name!="Loader" and issubclass(cl, Loader) ]