Issue getting data with the GraphQL API using Python

Hi,

I am new to the Open Targets Platform’s GraphQL API. When I successfully construct the following query in the playground:

# Fetch tractability and associated diseases for one target, plus the
# evidence rows attached to each associated disease for that same target.
query target {
  target(ensemblId: "ENSG00000091831"){
    tractability{
      modality
    }
    associatedDiseases{
      rows{
        score
        datatypeScores{
          score
        }
        disease {
          id
          name
          directLocationIds
          therapeuticAreas{
            id
            name
          }
          # `ensemblIds` takes an array of Ensembl IDs (per the helpdesk reply
          # later in this thread); a single string literal is supplied here.
          evidences(ensemblIds: "ENSG00000091831"){
            count
            rows{
              drug{
                id
                name
                drugType
              }
              datatypeId
              reactionName
              pathways{
                id
                name
              }
              resourceScore
              clinicalPhase
            }
          }
        }
      }
    }
  }
}

After this, I am trying to use it in Python, but it doesn’t work. I can’t figure out why it’s reporting an error.
Could you please help me out? Thanks a lot

The code is as follows:

#!/usr/bin/env python3

# Import relevant libraries for HTTP request and JSON formatting
import requests
import json
from traceback import print_exc
from time import sleep

def get_associations(target):
    """POST one GraphQL query to the Open Targets API and return its `target` entry.

    Args:
        target: Mapping with a 'GeneID' key; that value is sent as the
            `$ensemblId` query variable.

    Returns:
        The value found under data -> target in the decoded JSON response.

    Note:
        Any exception is printed with its traceback, followed by a 10-second
        sleep and a recursive retry. There is no retry limit.
    """
    # Make one type of OpenTargets query.
    # (This function exists for error-handling purposes).
    # NOTE(review): `evidences(ensemblIds: $ensemblId)` below passes the scalar
    # `$ensemblId` variable; the helpdesk reply in this thread states that the
    # field takes an array and the variable must be wrapped as `[$ensemblId]`.
    query_string = """
    query target($ensemblId: String!){
        target(ensemblId: $ensemblId){
            tractability{
                modality
            }
            associatedDiseases{
                rows{
                    score
                    datatypeScores{
                        score
                    }
                    disease {
                        id
                        name
                        directLocationIds
                        therapeuticAreas{
                            id
                            name
                        }
                        evidences(ensemblIds: $ensemblId){
                            count
                            rows{
                                drug{
                                    id
                                    name
                                    drugType
                                }
                                datatypeId
                                reactionName
                                pathways{
                                    id
                                    name
                                }
                                resourceScore
                                clinicalPhase
                            }
                        }
                    }
                }
            }        
        }
    }
    """

    # The only query variable: the Ensembl gene ID taken from the caller's mapping.
    variables = {"ensemblId": target['GeneID']}

    # Set base URL of GraphQL API endpoint
    base_url = "https://api.platform.opentargets.org/api/v4/graphql"
           
    try:
        r = requests.post(base_url, json={"query": query_string, "variables": variables})
        # If the response has no 'data' entry, `.get('data')` is None and the
        # chained `.get('target')` raises, which lands in the handler below.
        assocs = json.loads(r.text).get('data').get('target')
    except Exception as e:
        print("Exception in Disease Association function, pt1:")
        print(e)
        print_exc()
        print("Retrying in 10 seconds...")
        sleep(10)
        # Recursive retry with the same input; a failure that never clears
        # keeps recursing.
        assocs = get_associations(target)
    return assocs

# Example run, using the same Ensembl gene ID as the playground query above.
target = {"GeneID": "ENSG00000091831"}
assocs = get_associations(target)

This question was sent to the Open Targets Helpdesk and has been anonymised.

Hello,

I have reviewed the Python script and the reason for the error message relates to the arguments passed in the evidences field portion of the query string.

The evidences field takes an array of Ensembl IDs – ensemblIds – as an argument. And so you will need to encapsulate that $ensemblId in square brackets.

Please see below for a simplified version of your script that includes evidences(ensemblIds: [$ensemblId]):

def get_associations(target, max_retries=3, retry_delay=10, timeout=30):
    """Fetch disease associations for one target from the Open Targets GraphQL API.

    Args:
        target: Mapping with a 'GeneID' key holding the Ensembl gene ID; the
            value is sent as the `$ensemblId` query variable.
        max_retries: Total number of attempts for network or decoding
            failures (at least one attempt is always made).
        retry_delay: Seconds to sleep between attempts.
        timeout: Timeout in seconds passed to `requests.post`.

    Returns:
        The value under data -> target in the response, or None when the
        response carries no such entry.

    Raises:
        RuntimeError: If the API answers with GraphQL `errors` (not retried,
            because re-sending an identical query is not expected to help),
            if the response is not a JSON object, or if every attempt failed
            with a network/decoding error.
    """
    # `evidences(ensemblIds: ...)` takes an array, hence `[$ensemblId]`.
    query_string = """
        query target($ensemblId: String!){
            target(ensemblId: $ensemblId){
                    associatedDiseases{
                        rows{
                            score
                            disease {
                                id
                                name
                                evidences(ensemblIds: [$ensemblId]){
                                    count
                                }
                            }
                        }
                    }
                }
            }
        """

    variables = {"ensemblId": target['GeneID']}

    # Set base URL of GraphQL API endpoint
    base_url = "https://api.platform.opentargets.org/api/v4/graphql"

    attempts = max(1, max_retries)
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            r = requests.post(
                base_url,
                json={"query": query_string, "variables": variables},
                timeout=timeout,
            )
            payload = r.json()
        except (requests.RequestException, ValueError) as e:
            # Only transport problems and undecodable bodies are retried;
            # a bounded loop replaces the previous unbounded recursion.
            last_error = e
            print("Exception in Disease Association function, pt1:")
            print(e)
            print_exc()
            if attempt < attempts:
                print(f"Retrying in {retry_delay} seconds...")
                sleep(retry_delay)
            continue

        if not isinstance(payload, dict):
            raise RuntimeError(f"Unexpected response from {base_url}: {payload!r}")

        # Surface query/validation problems instead of retrying them blindly.
        errors = payload.get("errors")
        if errors:
            raise RuntimeError(f"GraphQL query failed: {errors}")

        # `data` may be missing or null; treat both as "no target".
        return (payload.get("data") or {}).get("target")

    raise RuntimeError(
        f"Open Targets request failed after {attempts} attempt(s)"
    ) from last_error

# Example run for a single Ensembl gene ID.
target = {"GeneID": "ENSG00000091831"}
assocs = get_associations(target)

Upon reviewing the script that you submitted, I would strongly advise you to use our dataset downloads available in either Parquet or JSON format:

http://ftp.ebi.ac.uk/pub/databases/opentargets/platform/latest/output/etl/

Using our dataset downloads will give you the ability to specify a specific data release version (e.g. 21.11), select whether you want to use our direct or indirect associations, and parse the evidence data for the sources that you are interested in. Based on your script, you are querying for both Reactome and ChEMBL evidence and you could use those evidence datasets to query for all relevant fields.

Thanks a lot, I will do as you said.

hello,
thanks again for your kind help. I have followed your advice, used the dataset downloads, and am selecting the data of interest via PySpark. Here is my code:

"""
# import relevant libraries
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pandas as pd

# Build the Spark session: start from a local[*] master, apply each setting
# in the order listed, then get or create the session.
_spark_settings = [
    ('spark.executor.memory', '2g'),
    ('spark.driver.memory', '1g'),
    ("spark.executor.cores", '4'),
    ('spark.cores.max', '8'),
    ('spark.default.parallelism', '16'),
    ('spark.logConf', True),
    ('spark.network.timeout', 300),
]

_session_builder = SparkSession.builder.master('local[*]')
for _setting_key, _setting_value in _spark_settings:
    _session_builder = _session_builder.config(_setting_key, _setting_value)

spark = _session_builder.getOrCreate()

# Each dataset is a Parquet directory; its location is declared right next to
# the read that uses it.

# drugs (molecule dataset)
drug_data_path = "Data/opentarget/molecule"
drug_data = spark.read.parquet(drug_data_path)

# targets
targets_data_path = "Data/opentarget/targets"
targets_data = spark.read.parquet(targets_data_path)

# diseases
disease_data_path = "Data/opentarget/diseases"
disease_data = spark.read.parquet(disease_data_path)

# evidence
evidence_data_path = "Data/opentarget/evidence"
evidence_data = spark.read.parquet(evidence_data_path)

# overall association scores, direct and indirect
associations_direct_data_path = "Data/opentarget/associationByOverallDirect"
associations_direct_data = spark.read.parquet(associations_direct_data_path)

associations_indirect_data_path = "Data/opentarget/associationByOverallIndirect"
associations_indirect_data = spark.read.parquet(associations_indirect_data_path)


# Create subsets holding only the fields used downstream.
drug_data_subset = drug_data.select(F.col("id").alias("drugId"))

targets_data_subset = targets_data.select(
    F.col("id").alias("targetId"),
    F.col("tractability.modality"),
    F.col("pathways.pathway"),
)

disease_data_subset = disease_data.select(
    F.col("id").alias("diseaseId"),
    F.col("name").alias("diseaseLabel"),
    F.col("directLocationIds"),
    F.col("therapeuticAreas"),
)

# `targetId` is kept so that evidence can be matched to the association row
# of the same target-disease pair rather than to every target of a disease.
# NOTE(review): assumes the evidence dataset has a `targetId` column, as the
# association datasets do — confirm with evidence_data.printSchema().
evidence_data_subset = evidence_data.select(
    "targetId",
    "diseaseId",
    "drugId",
    "literature",
    F.col("clinicalPhase").alias("Max clinical phase"),
    F.col("datatypeId").alias("Datatype"),
    F.col("resourceScore").alias("Resource Score"),
)

associations_direct_data_subset = associations_direct_data.select(
    "targetId",
    "diseaseId",
    F.col("score").alias("overallAssociationScore"),
)

associations_indirect_data_subset = associations_indirect_data.select(
    "targetId",
    "diseaseId",
    F.col("score").alias("overallAssociationScore"),
)

# Merge associations, evidence, disease and drug data.
# - Evidence is joined on BOTH keys. Joining on `diseaseId` alone pairs each
#   association with the evidence of every target for that disease, which
#   multiplies the row count and attaches unrelated evidence.
# - Drugs use a left join: evidence rows without a drug are kept, while
#   drug-only rows (which have no target and would be dropped by the filter
#   below anyway) are never produced.
output_direct = (
    associations_direct_data_subset
    .join(evidence_data_subset, on=["diseaseId", "targetId"], how="inner")
    .join(disease_data_subset, on="diseaseId", how="inner")
    .join(drug_data_subset, on="drugId", how="left")
)

# Restrict to a single target before collecting to the driver as pandas.
query_id = 'ENSG00000105974'
query_result = output_direct.filter(F.col("targetId") == query_id)
query_result_df = query_result.toPandas()

Everything works fine until I try to merge the subsets. Could you please help me out again? Thanks a lot.

Hi recherHE

It’s not clear to me exactly what is going wrong without seeing a stack trace with the error. Spark is evaluated lazily, so it’s also not guaranteed that the error you are seeing is being caused by the merge statement rather than by something further up the chain.

I’d also point out that you’re using very large data sets on a local set-up. You’re almost certainly going to get out of memory issues without using a cluster.