Getting Started with PySpark

Create a SparkSession and import the SQL functions module

<pre>
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
</pre>
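A quick way to confirm the session is up (not part of the original page, just standard SparkSession attributes):

<pre>
# The builder returns the existing active session if one is already running
print(spark.version)
print(spark.sparkContext.appName)
</pre>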

Load some data

<pre>
df = spark.read.load("DEX03s - 2019-10-07.csv",
                     format="csv", sep=",", inferSchema="true", header="true")
</pre>

Find columns that are more than 90% null

<pre>
# F.when(...) yields NULL when the condition is false, and F.count() skips NULLs,
# so this counts the null rows in each column in a single pass
threshold = df.count() * 0.90
null_counts = df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]).collect()[0].asDict()
# Flag columns whose null count reaches 90% of the row count
to_drop = [k for k, v in null_counts.items() if v >= threshold]
</pre>
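An equivalent formulation (a sketch, not from the original page) computes null fractions directly and avoids the separate df.count() pass:

<pre>
# Fraction of nulls per column: cast the boolean isNull flag to int and average it
null_fracs = df.select([F.mean(F.col(c).isNull().cast("int")).alias(c) for c in df.columns]).collect()[0].asDict()
to_drop = [k for k, v in null_fracs.items() if v >= 0.90]
</pre>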

Drop the mostly-null columns

<pre>
# Drop the flagged columns; display() is a notebook helper (e.g. Databricks), use clean.show() in plain PySpark
clean = df.drop(*to_drop)
display(clean)
</pre>
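As a quick check (using only the names defined above), compare the column counts before and after the drop:

<pre>
# How many columns were removed, and which ones remain
print(f"dropped {len(to_drop)} of {len(df.columns)} columns")
print(clean.columns)
</pre>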