The filter() function in PySpark is used to create a new DataFrame by selecting rows that meet a specified condition or SQL expression. Alternatively, the where() function can be used in place of filter(), as both functions perform the same operation. They return a new DataFrame that includes only the rows that satisfy the given condition.
from pyspark.sql.types import StructType,StructField
from pyspark.sql.types import StringType, IntegerType, ArrayType
# Create data: each row is ((firstname, middlename, lastname), languages, state, gender).
# NOTE(review): the original paste was truncated; row values reconstructed to match
# the schema fields below — confirm against the upstream example.
data = [
    (("James", "", "Smith"), ["Java", "Scala", "C++"], "OH", "M"),
    (("Anna", "Rose", ""), ["Spark", "Java", "C++"], "NY", "F"),
    (("Julia", "", "Williams"), ["CSharp", "VB"], "OH", "F"),
    (("Maria", "Anne", "Jones"), ["CSharp", "VB"], "NY", "M"),
    (("Jen", "Mary", "Brown"), ["CSharp", "VB"], "NY", "M"),
    (("Mike", "Mary", "Williams"), ["Python", "VB"], "OH", "M")
]

# Create schema: a nested struct for the name, an array column for languages,
# and flat string columns for state and gender.
schema = StructType([
    StructField('name', StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True)
    ])),
    StructField('languages', ArrayType(StringType()), True),
    StructField('state', StringType(), True),
    StructField('gender', StringType(), True)
])

# Create dataframe from the rows and explicit schema, then inspect it.
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show(truncate=False)




# Select rows whose `languages` array column includes "Java".
from pyspark.sql.functions import array_contains

java_rows = df.where(array_contains(df["languages"], "Java"))
java_rows.show(truncate=False)
