I am trying to reproduce the code from the "Accessing and querying datasets with Python" tutorial.
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Location of the ClinVar (EVA) evidence dataset on the local filesystem.
evidencePath = "local directory path - e.g. /User/downloads/sourceId=eva"

# Start (or reuse) a Spark session running locally on all available cores.
spark = SparkSession.builder.master('local[*]').getOrCreate()

# Load the evidence dataset and inspect its schema.
evd = spark.read.parquet(evidencePath)
evd.printSchema()

# Flatten the clinicalSignificances array so each value gets its own row.
clinicalSignificance = F.explode("clinicalSignificances").alias("cs")

# Keep only the fields of interest.
evdSelect = evd.select(
    "targetId",
    "diseaseId",
    "variantRsId",
    "studyId",
    clinicalSignificance,
    "confidence",
)
evdSelect.show()
# +---------------+--------------+-----------+------------+--------------------+--------------------+
# |       targetId|     diseaseId|variantRsId|     studyId|                  cs|          confidence|
# +---------------+--------------+-----------+------------+--------------------+--------------------+
# |ENSG00000153201|Orphanet_88619|rs773278648|RCV001042548|uncertain signifi...|criteria provided...|
# |ENSG00000115718|  Orphanet_745|       null|RCV001134697|uncertain signifi...|criteria provided...|
# |ENSG00000107147|    HP_0001250|rs539139475|RCV000720408|       likely benign|criteria provided...|
# |ENSG00000175426|Orphanet_71528|rs142567487|RCV000292648|uncertain signifi...|criteria provided...|
# |ENSG00000169174|   EFO_0004911|rs563024336|RCV000375546|uncertain signifi...|criteria provided...|
# |ENSG00000140521|  Orphanet_298|rs376306906|RCV000763992|uncertain signifi...|criteria provided...|
# |ENSG00000134982|   EFO_0005842| rs74627407|RCV000073743|               other|no assertion crit...|
# |ENSG00000187498| MONDO_0008289|rs146288748|RCV001111533|uncertain signifi...|criteria provided...|
# |ENSG00000116688|Orphanet_64749|rs119103265|RCV000857104|uncertain signifi...|no assertion crit...|
# |ENSG00000133812|Orphanet_99956|rs562275980|RCV000367609|uncertain signifi...|criteria provided...|
# +---------------+--------------+-----------+------------+--------------------+--------------------+
# only showing top 10 rows

# Bring the selected evidence back to the driver as a Pandas DataFrame.
evdSelect.toPandas()
But somehow I got the error below. Could anyone help me solve this issue and tell me what is going wrong?
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
# path to ClinVar (EVA) evidence dataset
# directory stored on your local machine
evidencePath = "/User/xxxx/Desktop/sourceId=eva"
# establish spark connection
# NOTE(review): per the traceback below, this call never completes — it fails
# inside launch_gateway() with "Java gateway process exited before sending its
# port number", i.e. the JVM child process died before PySpark could connect.
# This usually indicates that no compatible Java runtime can be found (JAVA_HOME
# unset or pointing to an unsupported JDK) — verify `java -version` works in the
# same environment the notebook runs in. TODO: confirm Java installation.
spark = (
SparkSession.builder
.master('local[*]')
.getOrCreate()
)
# read evidence dataset
# (not reached — the session above fails to start)
evd = spark.read.parquet(evidencePath)
# Browse the evidence schema
evd.printSchema()
# select fields of interest:
# explode() unnests the clinicalSignificances array, producing one row per value
# under the alias "cs"; the remaining columns are carried through unchanged.
evdSelect = (evd
.select("targetId",
"diseaseId",
"variantRsId",
"studyId",
F.explode("clinicalSignificances").alias("cs"),
"confidence")
)
evdSelect.show()
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/var/folders/sg/84_1tjdd0z96ls5mpw9jp4hc0000gn/T/ipykernel_13805/3028577531.py in <module>
9 # establish spark connection
10 spark = (
---> 11 SparkSession.builder
12 .master('local[*]')
13 .getOrCreate()
~/opt/anaconda3/lib/python3.9/site-packages/pyspark/sql/session.py in getOrCreate(self)
267 sparkConf.set(key, value)
268 # This SparkContext may be an existing one.
--> 269 sc = SparkContext.getOrCreate(sparkConf)
270 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
271 # by all sessions.
~/opt/anaconda3/lib/python3.9/site-packages/pyspark/context.py in getOrCreate(cls, conf)
481 with SparkContext._lock:
482 if SparkContext._active_spark_context is None:
--> 483 SparkContext(conf=conf or SparkConf())
484 assert SparkContext._active_spark_context is not None
485 return SparkContext._active_spark_context
~/opt/anaconda3/lib/python3.9/site-packages/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls, udf_profiler_cls)
193 )
194
--> 195 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
196 try:
197 self._do_init(
~/opt/anaconda3/lib/python3.9/site-packages/pyspark/context.py in _ensure_initialized(cls, instance, gateway, conf)
415 with SparkContext._lock:
416 if not SparkContext._gateway:
--> 417 SparkContext._gateway = gateway or launch_gateway(conf)
418 SparkContext._jvm = SparkContext._gateway.jvm
419
~/opt/anaconda3/lib/python3.9/site-packages/pyspark/java_gateway.py in launch_gateway(conf, popen_kwargs)
104
105 if not os.path.isfile(conn_info_file):
--> 106 raise RuntimeError("Java gateway process exited before sending its port number")
107
108 with open(conn_info_file, "rb") as info:
RuntimeError: Java gateway process exited before sending its port number