from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
# Path to the ClinVar (EVA) evidence dataset:
# a directory stored on your local machine.
evidencePath = "local directory path - e.g. /User/downloads/sourceId=eva"

# Establish the Spark connection. getOrCreate() returns the already active
# session when there is one (e.g. in a notebook or the pyspark shell) and
# otherwise starts a new one, so `spark` is always defined before it is used.
spark = SparkSession.builder.getOrCreate()

# Read the evidence dataset (Parquet files under evidencePath).
evd = spark.read.parquet(evidencePath)

# Browse the evidence schema
evd.printSchema()

# Select the fields of interest. `clinicalSignificances` is exploded so that
# each of its elements gets its own row, exposed as the column "cs".
evdSelect = evd.select(
    "targetId",
    "diseaseId",
    "variantRsId",
    "studyId",
    F.explode("clinicalSignificances").alias("cs"),
    "confidence",
)

# Preview the first 10 rows of the selection; sample output:
evdSelect.show(10)
# +---------------+--------------+-----------+------------+--------------------+--------------------+
# | targetId| diseaseId|variantRsId| studyId| cs| confidence|
# +---------------+--------------+-----------+------------+--------------------+--------------------+
# |ENSG00000153201|Orphanet_88619|rs773278648|RCV001042548|uncertain signifi...|criteria provided...|
# |ENSG00000115718| Orphanet_745| null|RCV001134697|uncertain signifi...|criteria provided...|
# |ENSG00000107147| HP_0001250|rs539139475|RCV000720408| likely benign|criteria provided...|
# |ENSG00000175426|Orphanet_71528|rs142567487|RCV000292648|uncertain signifi...|criteria provided...|
# |ENSG00000169174| EFO_0004911|rs563024336|RCV000375546|uncertain signifi...|criteria provided...|
# |ENSG00000140521| Orphanet_298|rs376306906|RCV000763992|uncertain signifi...|criteria provided...|
# |ENSG00000134982| EFO_0005842| rs74627407|RCV000073743| other|no assertion crit...|
# |ENSG00000187498| MONDO_0008289|rs146288748|RCV001111533|uncertain signifi...|criteria provided...|
# |ENSG00000116688|Orphanet_64749|rs119103265|RCV000857104|uncertain signifi...|no assertion crit...|
# |ENSG00000133812|Orphanet_99956|rs562275980|RCV000367609|uncertain signifi...|criteria provided...|
# +---------------+--------------+-----------+------------+--------------------+--------------------+
# only showing top 10 rows
# Convert to a pandas DataFrame