This case is most comparable to local training: the training data set is read from a large file in the cloud. This often happens with tabular data, in which case the entire data set is available in storage formats such as CSV, Parquet, etc.
The code below shows a way to flexibly read data in any file format from cloud storage. The BlobHandler class manages the operations for reading from the cloud, while the StreamConverter class determines how a given file format is interpreted and converted (here, a CSV file is converted into a Pandas DataFrame).
import io
from abc import ABC, abstractmethod
from typing import Iterable, Union

from google.cloud.storage import Blob, Client
@abstractmethod
def from_stream(self, stream: io.BytesIO) -> Iterable:
    """Convert the raw byte stream of a downloaded blob into an iterable.

    Concrete subclasses decide how the bytes are interpreted (e.g. a CSV
    converter parses them into a pandas DataFrame).

    Args:
        stream: In-memory buffer positioned at the start of the blob's bytes.

    Returns:
        An iterable representation of the data; the concrete type is
        chosen by the subclass.
    """
    # Abstract: the original fragment had no body, which is a SyntaxError;
    # a stub body plus @abstractmethod matches the ABC import above.
    ...
def __init__(self, bucket: str, io_handler: StreamConverter):
    """Bind a storage client, the target bucket, and a stream converter.

    Args:
        bucket: Name of the cloud storage bucket to read from.
        io_handler: Converter that turns a downloaded byte stream into
            an iterable (file-format specific).
    """
    self.io_handler = io_handler
    # One client is created per handler and reused for all reads.
    storage_client = Client()
    self.client = storage_client
    self.bucket = storage_client.get_bucket(bucket)
def read_from_blob(self, blob_or_uri: "Union[str, Blob]") -> Iterable:
    """Download one object from cloud storage and convert it to an iterable.

    Fixes the original fragment, which opened an empty buffer, never
    downloaded the blob into it, and returned None (no return statement).

    Args:
        blob_or_uri: A Blob object or a "gs://..." URI string naming the
            object to read.

    Returns:
        Whatever iterable the configured StreamConverter produces from
        the object's bytes (e.g. a DataFrame for a CSV converter).
    """
    with io.BytesIO() as stream:
        # Fetch the object's bytes into the in-memory buffer.
        self.client.download_blob_to_file(blob_or_uri, stream)
        # Rewind so the converter reads from the beginning.
        stream.seek(0)
        out = self.io_handler.from_stream(stream)
    return out
import pandas as pd
def from_stream(self, stream: io.BytesIO) -> pd.DataFrame: