Accessing and querying datasets with Python

I am trying to reproduce the code from the Accessing and querying datasets with Python documentation.

from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# path to ClinVar (EVA) evidence dataset 
# directory stored on your local machine
evidencePath = "local directory path - e.g. /User/downloads/sourceId=eva"

# establish spark connection
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# read evidence dataset
evd = spark.read.parquet(evidencePath)

# Browse the evidence schema
evd.printSchema()

# select fields of interest
evdSelect = (evd
 .select("targetId",
         "diseaseId",
         "variantRsId",
         "studyId",
         F.explode("clinicalSignificances").alias("cs"),
         "confidence")
 )
evdSelect.show()

# +---------------+--------------+-----------+------------+--------------------+--------------------+
# |       targetId|     diseaseId|variantRsId|     studyId|                  cs|          confidence|
# +---------------+--------------+-----------+------------+--------------------+--------------------+
# |ENSG00000153201|Orphanet_88619|rs773278648|RCV001042548|uncertain signifi...|criteria provided...|
# |ENSG00000115718|  Orphanet_745|       null|RCV001134697|uncertain signifi...|criteria provided...|
# |ENSG00000107147|    HP_0001250|rs539139475|RCV000720408|       likely benign|criteria provided...|
# |ENSG00000175426|Orphanet_71528|rs142567487|RCV000292648|uncertain signifi...|criteria provided...|
# |ENSG00000169174|   EFO_0004911|rs563024336|RCV000375546|uncertain signifi...|criteria provided...|
# |ENSG00000140521|  Orphanet_298|rs376306906|RCV000763992|uncertain signifi...|criteria provided...|
# |ENSG00000134982|   EFO_0005842| rs74627407|RCV000073743|               other|no assertion crit...|
# |ENSG00000187498| MONDO_0008289|rs146288748|RCV001111533|uncertain signifi...|criteria provided...|
# |ENSG00000116688|Orphanet_64749|rs119103265|RCV000857104|uncertain signifi...|no assertion crit...|
# |ENSG00000133812|Orphanet_99956|rs562275980|RCV000367609|uncertain signifi...|criteria provided...|
# +---------------+--------------+-----------+------------+--------------------+--------------------+
# only showing top 10 rows

# Convert to a Pandas DataFrame
evdSelect.toPandas()

But I got the error below. Could anyone help me solve this issue and tell me what is going wrong?

from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# path to ClinVar (EVA) evidence dataset 
# directory stored on your local machine
evidencePath = "/User/xxxx/Desktop/sourceId=eva"

# establish spark connection
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# read evidence dataset
evd = spark.read.parquet(evidencePath)

# Browse the evidence schema
evd.printSchema()

# select fields of interest
evdSelect = (evd
 .select("targetId",
         "diseaseId",
         "variantRsId",
         "studyId",
         F.explode("clinicalSignificances").alias("cs"),
         "confidence")
 )
evdSelect.show()
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/var/folders/sg/84_1tjdd0z96ls5mpw9jp4hc0000gn/T/ipykernel_13805/3028577531.py in <module>
      9 # establish spark connection
     10 spark = (
---> 11     SparkSession.builder
     12     .master('local[*]')
     13     .getOrCreate()

~/opt/anaconda3/lib/python3.9/site-packages/pyspark/sql/session.py in getOrCreate(self)
    267                         sparkConf.set(key, value)
    268                     # This SparkContext may be an existing one.
--> 269                     sc = SparkContext.getOrCreate(sparkConf)
    270                     # Do not update `SparkConf` for existing `SparkContext`, as it's shared
    271                     # by all sessions.

~/opt/anaconda3/lib/python3.9/site-packages/pyspark/context.py in getOrCreate(cls, conf)
    481         with SparkContext._lock:
    482             if SparkContext._active_spark_context is None:
--> 483                 SparkContext(conf=conf or SparkConf())
    484             assert SparkContext._active_spark_context is not None
    485             return SparkContext._active_spark_context

~/opt/anaconda3/lib/python3.9/site-packages/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls, udf_profiler_cls)
    193             )
    194 
--> 195         SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
    196         try:
    197             self._do_init(

~/opt/anaconda3/lib/python3.9/site-packages/pyspark/context.py in _ensure_initialized(cls, instance, gateway, conf)
    415         with SparkContext._lock:
    416             if not SparkContext._gateway:
--> 417                 SparkContext._gateway = gateway or launch_gateway(conf)
    418                 SparkContext._jvm = SparkContext._gateway.jvm
    419 

~/opt/anaconda3/lib/python3.9/site-packages/pyspark/java_gateway.py in launch_gateway(conf, popen_kwargs)
    104 
    105             if not os.path.isfile(conn_info_file):
--> 106                 raise RuntimeError("Java gateway process exited before sending its port number")
    107 
    108             with open(conn_info_file, "rb") as info:

RuntimeError: Java gateway process exited before sending its port number

Hi,

Welcome to our community portal! The observed exception happens before reading the files or doing any operation on the data. What goes wrong is that Python could not establish a connection with the underlying Java layer. Unfortunately, there can be multiple underlying reasons for this. Please take a look at this Stack Overflow question and try to narrow down the issue further.

First of all, you need a working Java installation and JAVA_HOME set in your conda environment. That is a reasonable starting point. Please try to troubleshoot and get back to us if you are stuck.
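
If it helps, here is a minimal sketch of how you could check the Java setup from the same Python session before calling SparkSession.builder. It only uses the standard library; the JDK path in the last comment is purely hypothetical, so substitute your own:

import os
import shutil
import subprocess

# check that JAVA_HOME is visible to this Python process;
# Spark's launcher relies on it to locate the JVM
print("JAVA_HOME =", os.environ.get("JAVA_HOME"))

# check that a java executable is on the PATH
java_bin = shutil.which("java")
print("java on PATH:", java_bin)

if java_bin is not None:
    # note: java -version prints its output to stderr
    result = subprocess.run(
        [java_bin, "-version"], capture_output=True, text=True
    )
    print(result.stderr.strip())

# if JAVA_HOME is unset, you can point it at your JDK before
# building the SparkSession, e.g. (hypothetical path):
# os.environ["JAVA_HOME"] = "/path/to/your/jdk"

If JAVA_HOME is unset or the version check fails, that is the most likely cause; note that recent PySpark 3.x releases generally expect Java 8 or 11.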

Good luck!
Best,
Daniel
