PySpark Join is used to combine two DataFrames and by chaining these you can join multiple DataFrames.
# Syntax
join(self, other, on=None, how=None)
| Join String | Equivalent SQL Join |
| inner | INNER JOIN |
| outer, full, fullouter, full_outer | FULL OUTER JOIN |
| left, leftouter, left_outer | LEFT JOIN |
| right, rightouter, right_outer | RIGHT JOIN |
| cross | CROSS JOIN (Cartesian product of the two DataFrames) |
| anti, leftanti, left_anti | A Left Anti Join is used to filter the left DataFrame by keeping only the rows that do not have matching keys in the right DataFrame. This is useful for identifying records in the left DataFrame that are absent in the right DataFrame. |
| semi, leftsemi, left_semi | A Left Semi Join is used to filter the left DataFrame by keeping only the rows that have matching keys in the right DataFrame. Unlike an inner join, a left semi join does not include columns from the right DataFrame in the result. |
from pyspark.sql import Row
# Employee records as pyspark Rows, built from plain tuples.
# Note: emp_id 5 references dept_id 105, which does not exist in the
# department data — handy for demonstrating outer/anti joins.
employee_data = [
    Row(emp_id=eid, emp_name=ename, dept_id=did)
    for eid, ename, did in [
        (1, 'Alice', 101),
        (2, 'Bob', 102),
        (3, 'Catherine', 101),
        (4, 'David', 103),
        (5, 'David', 105),
    ]
]
# Department records as pyspark Rows, built from plain tuples.
# Note: dept_id 104 has no employees — handy for demonstrating
# right/outer joins.
department_data = [
    Row(dept_id=did, dept_name=dname)
    for did, dname in [
        (101, 'HR'),
        (102, 'Finance'),
        (103, 'IT'),
        (104, 'Marketing'),
    ]
]
# Materialize DataFrames from the Row lists (schema is inferred from the
# Row field names) and display both.
# NOTE(review): assumes an existing SparkSession bound to `spark`
# (e.g. a pyspark shell / notebook session) — confirm.
employees_df = spark.createDataFrame(employee_data)
departments_df = spark.createDataFrame(department_data)
for _df in (employees_df, departments_df):
    _df.show()



