import requests
import csv
def get_missing_labels(limit=10000):
    """Query Wikidata for items that have no Malayalam label.

    The SPARQL query selects items that are an instance (P31) of Q202444,
    have an English description, and have no ``ml`` label. The English
    label is fetched through an OPTIONAL pattern so that items without one
    are still returned (with an empty ``english_label``).

    Args:
        limit: Maximum number of rows requested via the SPARQL ``LIMIT``.

    Returns:
        A list of ``{'entity_id': ..., 'english_label': ...}`` dicts, or
        ``None`` when the request fails or the endpoint answers with a
        status code other than 200.
    """
    endpoint = "https://query.wikidata.org/sparql"

    # ?itemLabel has to be bound explicitly: merely naming it in SELECT
    # leaves it unbound, which made every english_label come back empty.
    # NOTE(review): Q202444 appears to be the generic "given name" class;
    # "male given name" is believed to be Q12308941 -- confirm on Wikidata
    # before changing the class used below.
    query = f"""
    SELECT ?item ?itemLabel
    WHERE {{
      ?item wdt:P31 wd:Q202444; # Instance of "male given name"
            schema:description ?description.
      FILTER(LANG(?description) = "en") # English description
      OPTIONAL {{
        ?item rdfs:label ?itemLabel.
        FILTER(LANG(?itemLabel) = "en") # English label, when one exists
      }}
      FILTER NOT EXISTS {{
        ?item rdfs:label ?malayalamLabel.
        FILTER(LANG(?malayalamLabel) = "ml") # Malayalam label does not exist
      }}
    }}
    LIMIT {limit}
    """

    # NOTE(review): a browser-like User-Agent is sent here; the endpoint
    # operator may prefer a descriptive client identifier -- confirm.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/json',
        'Content-Type': 'application/json',
    }
    params = {
        'query': query,
        'format': 'json',
    }

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(endpoint, headers=headers, params=params, timeout=90)
    except requests.RequestException as exc:
        # Report transport failures the same way as a bad status code.
        print(f"Failed to retrieve data. Request error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return None

    bindings = response.json().get('results', {}).get('bindings', [])
    return [
        {
            # The item value is an entity URI; keep only the trailing Q-id.
            'entity_id': row.get('item', {}).get('value', '').split('/')[-1],
            'english_label': row.get('itemLabel', {}).get('value', ''),
        }
        for row in bindings
    ]
def write_to_csv(data, filename='missing_labels_male_given_name.csv'):
    """Dump the collected rows to a UTF-8 CSV file with a header line.

    Args:
        data: Iterable of dicts keyed by ``entity_id`` and ``english_label``.
        filename: Path of the CSV file to create or overwrite.
    """
    columns = ['entity_id', 'english_label']
    # newline='' hands line-ending control to the csv module.
    with open(filename, mode='w', encoding='utf-8', newline='') as handle:
        sheet = csv.DictWriter(handle, fieldnames=columns)
        sheet.writeheader()
        sheet.writerows(data)
if __name__ == "__main__":
    # Script entry point: fetch the items, echo them, then save them to CSV.
    results = get_missing_labels(limit=10000)
    if not results:
        # Covers both a failed request (None) and an empty result list.
        print("No results obtained.")
    else:
        print("Items with English description 'male given name' lacking Malayalam label (limited to 10000 results):")
        for row in results:
            print(f"{row['entity_id']}: {row['english_label']}")
        # Persist the same rows that were just printed.
        write_to_csv(results, filename='missing_labels_male_given_name.csv')
        print("\nResults have been saved to missing_labels_male_given_name.csv.")