
conditions

condition_and(*conditions)

Combines multiple conditions using logical AND.

Parameters:

    *conditions (ColumnOrName): Multiple PySpark Column objects or SQL expression strings representing conditions. Default: ().

Returns:

    Column: A single PySpark Column object representing the combined condition.

Examples:

>>> condition_and(F.col('col1') > 1, F.col('col2') < 5)
Column<'((col1 > 1) AND (col2 < 5))'>
>>> condition_and(F.col('col1') > 1, "col2 < 5")
Column<'((col1 > 1) AND (col2 < 5))'>
Source code in pysparky/functions/conditions.py
def condition_and(*conditions: ColumnOrName) -> Column:
    """
    Combines multiple conditions using logical AND.

    Args:
        *conditions (ColumnOrName): Multiple PySpark Column objects or SQL expression strings representing conditions.

    Returns:
        Column: A single PySpark Column object representing the combined condition.

    Examples:
        >>> condition_and(F.col('col1') > 1, F.col('col2') < 5)
        Column<'((col1 > 1) AND (col2 < 5))'>

        >>> condition_and(F.col('col1') > 1, "col2 < 5")
        Column<'((col1 > 1) AND (col2 < 5))'>
    """
    parsed_conditions = [
        F.expr(cond) if isinstance(cond, str) else cond for cond in conditions
    ]
    return reduce(and_, parsed_conditions, F.lit(True))
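
The reduction seeds with F.lit(True), so calling condition_and() with no arguments yields an always-true column. A minimal filter sketch, not taken from the library's own docs (assumes a SparkSession named spark; the data is illustrative):

>>> df = spark.createDataFrame([(2, 3), (0, 3), (2, 9)], ["col1", "col2"])
>>> df.filter(condition_and(F.col("col1") > 1, "col2 < 5")).show()
+----+----+
|col1|col2|
+----+----+
|   2|   3|
+----+----+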

condition_or(*conditions)

Combines multiple conditions using logical OR.

Parameters:

    *conditions (ColumnOrName): Multiple PySpark Column objects or SQL expression strings representing conditions. Default: ().

Returns:

    Column: A single PySpark Column object representing the combined condition.

Examples:

>>> condition_or(F.col('col1') > 1, F.col('col2') < 5)
Column<'((col1 > 1) OR (col2 < 5))'>
>>> condition_or(F.col('col1') > 1, "col2 < 5")
Column<'((col1 > 1) OR (col2 < 5))'>
Source code in pysparky/functions/conditions.py
def condition_or(*conditions: ColumnOrName) -> Column:
    """
    Combines multiple conditions using logical OR.

    Args:
        *conditions (ColumnOrName): Multiple PySpark Column objects or SQL expression strings representing conditions.

    Returns:
        Column: A single PySpark Column object representing the combined condition.

    Examples:
        >>> condition_or(F.col('col1') > 1, F.col('col2') < 5)
        Column<'((col1 > 1) OR (col2 < 5))'>

        >>> condition_or(F.col('col1') > 1, "col2 < 5")
        Column<'((col1 > 1) OR (col2 < 5))'>
    """
    parsed_conditions = [
        F.expr(cond) if isinstance(cond, str) else cond for cond in conditions
    ]
    return reduce(or_, parsed_conditions, F.lit(False))
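
Symmetrically, the reduction here seeds with F.lit(False), so condition_or() with no arguments is always false. A hedged sketch with the same illustrative data as above:

>>> df = spark.createDataFrame([(2, 3), (0, 3), (0, 9)], ["col1", "col2"])
>>> df.filter(condition_or(F.col("col1") > 1, "col2 < 5")).show()
+----+----+
|col1|col2|
+----+----+
|   2|   3|
|   0|   3|
+----+----+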

is_all_numbers_only(column_or_name)

Checks if the given column or string contains only numeric characters.

Parameters:

    column_or_name (ColumnOrName): The column or string to be checked. Required.

Returns:

    Column: A column of boolean values indicating whether each entry contains only numeric characters.

Examples:

>>> df = spark.createDataFrame([("123",), ("4567",), ("89a",), ("",), ("0",)], ["value"])
>>> df.select(is_all_numbers_only(df["value"]).alias("is_all_numbers")).show()
+--------------+
|is_all_numbers|
+--------------+
|          true|
|          true|
|         false|
|         false|
|          true|
+--------------+
Source code in pysparky/functions/conditions.py
def is_all_numbers_only(column_or_name: ColumnOrName) -> Column:
    """
    Checks if the given column or string contains only numeric characters.

    Args:
        column_or_name (ColumnOrName): The column or string to be checked.

    Returns:
        Column: A column of boolean values indicating whether each entry contains only numeric characters.

    Examples:
        >>> df = spark.createDataFrame([("123",), ("4567",), ("89a",), ("",), ("0",)], ["value"])
        >>> df.select(is_all_numbers_only(df["value"]).alias("is_all_numbers")).show()
        +--------------+
        |is_all_numbers|
        +--------------+
        |          true|
        |          true|
        |         false|
        |         false|
        |          true|
        +--------------+
    """
    return is_n_numbers_only(column_or_name, n="+")

is_n_character_only(column_or_name, n)

Checks if the given column or string contains exactly n alphabetic characters.

Parameters:

    column_or_name (ColumnOrName): The column or string to be checked. Required.
    n (int): The exact number of alphabetic characters to match. Required.

Returns:

    Column: A column of boolean values indicating whether each entry matches the regular expression.

Source code in pysparky/functions/conditions.py
def is_n_character_only(column_or_name: ColumnOrName, n: int) -> Column:
    """
    Checks if the given column or string contains exactly `n` alphabetic characters.

    Args:
        column_or_name (ColumnOrName): The column or string to be checked.
        n (int): The exact number of alphabetic characters to match.

    Returns:
        Column: A column of boolean values indicating whether each entry matches the regular expression.
    """
    # double curly braces {{ }} to escape the braces in the f-string
    regexp = rf"^[a-zA-Z]{{{n}}}$"
    return F.regexp_like(column_or_name, regexp=F.lit(regexp))
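
No example is rendered for this helper, so the following is a sketch of the expected behaviour (assumes a SparkSession named spark; the data is illustrative):

>>> df = spark.createDataFrame([("ab",), ("abc",), ("a1",)], ["value"])
>>> df.select(is_n_character_only(df["value"], 3).alias("is_3_char")).show()
+---------+
|is_3_char|
+---------+
|    false|
|     true|
|    false|
+---------+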

is_n_numbers_only(column_or_name, n)

Checks if the given column or string contains exactly n numeric characters.

Parameters:

    column_or_name (ColumnOrName): The column or string to be checked. Required.
    n (int | str): The exact number of numeric characters to match, or "+" for a number of any length. Required.

Returns:

    Column: A column of boolean values indicating whether each entry matches the regular expression.

Examples:

>>> df = spark.createDataFrame([("123",), ("4567",), ("89a",), ("",), ("0",)], ["value"])
>>> df.select(is_n_numbers_only(df["value"], 3).alias("is_n_numbers")).show()
+------------+
|is_n_numbers|
+------------+
|        true|
|       false|
|       false|
|       false|
|       false|
+------------+
Source code in pysparky/functions/conditions.py
def is_n_numbers_only(column_or_name: ColumnOrName, n: int | str) -> Column:
    """
    Checks if the given column or string contains exactly `n` numeric characters.

    Args:
        column_or_name (ColumnOrName): The column or string to be checked.
        n (int | str): The exact number of numeric characters to match, or "+" for a number of any length.

    Returns:
        Column: A column of boolean values indicating whether each entry matches the regular expression.

    Examples:
        >>> df = spark.createDataFrame([("123",), ("4567",), ("89a",), ("",), ("0",)], ["value"])
        >>> df.select(is_n_numbers_only(df["value"], 3).alias("is_n_numbers")).show()
        +------------+
        |is_n_numbers|
        +------------+
        |        true|
        |       false|
        |       false|
        |       false|
        |       false|
        +------------+
    """
    if isinstance(n, int):
        # double curly braces {{ }} to escape the braces in the f-string
        regexp = rf"^\d{{{n}}}$"
    elif n == "+":
        # Any length number
        regexp = r"^\d+$"
    else:
        raise ValueError(
            "The parameter 'n' must be either an integer or the string '+'."
        )
    return F.regexp_like(column_or_name, F.lit(regexp))
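
A short sketch of the "+" mode, which is what is_all_numbers_only delegates to (assumes a SparkSession named spark; the data is illustrative):

>>> df = spark.createDataFrame([("123",), ("89a",)], ["value"])
>>> df.select(is_n_numbers_only(df["value"], "+").alias("any_length")).show()
+----------+
|any_length|
+----------+
|      true|
|     false|
+----------+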

is_printable_only(column_or_name)

Checks if the given column or string contains only printable characters.

Parameters:

    column_or_name (ColumnOrName): The column or string to be checked. Required.

Returns:

    Column: A column of boolean values indicating whether each entry contains only printable characters.

Examples:

>>> df = spark.createDataFrame([("Hello!",), ("World",), ("123",), ("",), ("Non-printable\x01",)], ["value"])
>>> df.select(is_printable_only(df["value"]).alias("is_printable")).show()
+------------+
|is_printable|
+------------+
|        true|
|        true|
|        true|
|       false|
|       false|
+------------+
Source code in pysparky/functions/conditions.py
def is_printable_only(column_or_name: ColumnOrName) -> Column:
    """
    Checks if the given column or string contains only printable characters.

    Args:
        column_or_name (ColumnOrName): The column or string to be checked.

    Returns:
        Column: A column of boolean values indicating whether each entry contains only printable characters.

    Examples:
        >>> df = spark.createDataFrame([("Hello!",), ("World",), ("123",), ("",), ("Non-printable\x01",)], ["value"])
        >>> df.select(is_printable_only(df["value"]).alias("is_printable")).show()
        +------------+
        |is_printable|
        +------------+
        |        true|
        |        true|
        |        true|
        |       false|
        |       false|
        +------------+
    """
    # Regular expression for printable ASCII characters (0x20 to 0x7E)
    regexp = r"^[\x20-\x7E]+$"
    return F.regexp_like(column_or_name, F.lit(regexp))
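
Note that the + quantifier in the pattern requires at least one character, so an empty string is reported as not printable; that is why the fourth row of the example above is false.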

is_two_character_only(column_or_name)

Checks if the given column or string contains exactly two alphabetic characters (either lowercase or uppercase).

Parameters:

    column_or_name (ColumnOrName): The column or string to be checked. Required.

Returns:

    Column: A boolean column indicating whether the input matches the pattern of exactly two alphabetic characters.

Examples:

>>> df = spark.createDataFrame([("aa",), ("ZZ",), ("a1",), ("abc",)], ["value"])
>>> df.select(is_two_character_only(df["value"]).alias("is_two_char")).show()
+-----------+
|is_two_char|
+-----------+
|       true|
|       true|
|      false|
|      false|
+-----------+
Source code in pysparky/functions/conditions.py
def is_two_character_only(column_or_name: ColumnOrName) -> Column:
    """
    Checks if the given column or string contains exactly two alphabetic characters (either lowercase or uppercase).

    Args:
        column_or_name (ColumnOrName): The column or string to be checked.

    Returns:
        Column: A boolean column indicating whether the input matches the pattern of exactly two alphabetic characters.

    Examples:
        >>> df = spark.createDataFrame([("aa",), ("ZZ",), ("a1",), ("abc",)], ["value"])
        >>> df.select(is_two_character_only(df["value"]).alias("is_two_char")).show()
        +-----------+
        |is_two_char|
        +-----------+
        |       true|
        |       true|
        |      false|
        |      false|
        +-----------+
    """
    return is_n_character_only(column_or_name, n=2)

startswiths(column_or_name, list_of_strings)

Creates a PySpark Column expression to check if the given column starts with any string in the list.

Parameters:

    column_or_name (ColumnOrName): The column to check. Required.
    list_of_strings (list[str]): A list of strings to check if the column starts with. Required.

Returns:

    Column: A PySpark Column expression that evaluates to True if the column starts with any string in the list, otherwise False.

Source code in pysparky/functions/conditions.py
@decorator.extension_enabler(Column)
def startswiths(
    column_or_name: ColumnOrName, list_of_strings: list[str]
) -> pyspark.sql.Column:
    """
    Creates a PySpark Column expression to check if the given column starts with any string in the list.

    Args:
        column_or_name (ColumnOrName): The column to check.
        list_of_strings (list[str]): A list of strings to check if the column starts with.

    Returns:
        Column: A PySpark Column expression that evaluates to True if the column starts with any string in the list, otherwise False.
    """
    (column,) = ensure_column(column_or_name)

    return reduce(
        or_,
        map(column.startswith, list_of_strings),
        F.lit(False),
    ).alias(f"startswiths_len{len(list_of_strings)}")