Example usage notebook

import dsa_tdb
from dsa_tdb import TDB_DataFrame

Create a Spark session

# Build the Spark session with the library's factory helper; the session is
# handed to TDB_DataFrame in the next cell.
# NOTE(review): called with no arguments, so whatever defaults the factory
# applies are used — check the dsa_tdb docs if you need to tune Spark.
spark = dsa_tdb.utils.spark_session_factory()

Load the data

# Create the wrapper around the Spark session created above.
df = TDB_DataFrame(spark=spark)
# Load the sample data found under root_folder.
# NOTE(review): presumably platform="global" selects the all-platforms dump and
# version="full" the full (not reduced) schema — confirm against the dsa_tdb docs.
df.loadData(root_folder="../data/sample_data", platform="global", version="full")
# Peek at the first record to check what was loaded (returns a list of Rows).
df.head(1)
[Row(uuid='593e02be-69f1-45b9-a387-a7be6258b781', decision_visibility='["DECISION_VISIBILITY_CONTENT_REMOVED"]', decision_visibility_other=None, end_date_visibility_restriction=None, decision_monetary=None, decision_monetary_other=None, end_date_monetary_restriction=None, decision_provision=None, end_date_service_restriction=None, decision_account=None, end_date_account_restriction=None, account_type=None, decision_ground='DECISION_GROUND_INCOMPATIBLE_CONTENT', decision_ground_reference_url=None, illegal_content_legal_ground=None, illegal_content_explanation=None, incompatible_content_ground='Adult content', incompatible_content_explanation='This Pin violates our Community Guidelines on adult content, because it contains sexualization of clothed individuals.', incompatible_content_illegal=None, category='STATEMENT_CATEGORY_PORNOGRAPHY_OR_SEXUALIZED_CONTENT', category_addition=None, category_specification='["KEYWORD_ADULT_SEXUAL_MATERIAL"]', category_specification_other=None, content_type='["CONTENT_TYPE_OTHER"]', content_type_other='Pin', content_language=None, content_date=datetime.datetime(2023, 9, 20, 0, 0), territorial_scope='EEA', application_date=datetime.datetime(2023, 9, 25, 0, 0), decision_facts='We identified this violation through our own investigation. This Pin violates our Community Guidelines on adult content, because it contains sexualization of clothed individuals.', source_type='SOURCE_VOLUNTARY', source_identity=None, automated_detection='No', automated_decision='AUTOMATED_DECISION_PARTIALLY', platform_name='Pinterest', platform_uid='24b09ca5-8c60-4eec-99c3-33a8fad3c4d2', created_at=datetime.datetime(2023, 9, 25, 0, 0))]

Perform a filter

# Keep only the records whose content_type is 'CONTENT_TYPE_IMAGE'.
# The return value is not used: `df` itself holds the filtered data afterwards.
# As the output below shows, the filtered frame also carries one boolean
# column per enumerated value (e.g. CONTENT_TYPE_IMAGE, KEYWORD_*, ...).
# NOTE(review): unclear from here whether records listing several content
# types (image plus others) are also kept — verify against filter_SoRs docs.
df.filter_SoRs(content_type=["CONTENT_TYPE_IMAGE"])
# Peek at the first record that survived the filter.
df.head(1)
[Row(DECISION_VISIBILITY_OTHER=False, KEYWORD_PATENT_INFRINGEMENT=False, STATEMENT_CATEGORY_SCAMS_AND_FRAUD=False, STATEMENT_CATEGORY_PORNOGRAPHY_OR_SEXUALIZED_CONTENT=False, platform_name='TikTok', illegal_content_explanation=None, incompatible_content_ground='Community Guidelines', KEYWORD_CHILD_SEXUAL_ABUSE_MATERIAL=False, DECISION_VISIBILITY_CONTENT_LABELLED=False, uuid='974b0502-c3cc-4ae0-8592-16a721a44e95', KEYWORD_DESIGN_INFRINGEMENT=False, STATEMENT_CATEGORY_VIOLENCE=False, KEYWORD_DISCRIMINATION=False, KEYWORD_GEOGRAPHICAL_REQUIREMENTS=False, territorial_scope='EEA_no_IS', KEYWORD_GENDER_BASED_VIOLENCE=False, CONTENT_TYPE_VIDEO=False, DECISION_VISIBILITY_CONTENT_DEMOTED=False, source_identity=None, KEYWORD_GOODS_SERVICES_NOT_PERMITTED=False, KEYWORD_PHISHING=False, content_type='["CONTENT_TYPE_IMAGE"]', KEYWORD_LANGUAGE_REQUIREMENTS=False, application_date=datetime.datetime(2023, 9, 25, 0, 0), STATEMENT_CATEGORY_NEGATIVE_EFFECTS_ON_CIVIC_DISCOURSE_OR_ELECTIONS=False, KEYWORD_RIGHT_TO_BE_FORGOTTEN=False, account_type=None, KEYWORD_HATE_SPEECH=False, CONTENT_TYPE_AUDIO=False, STATEMENT_CATEGORY_DATA_PROTECTION_AND_PRIVACY_VIOLATIONS=False, STATEMENT_CATEGORY_ANIMAL_WELFARE=False, DECISION_VISIBILITY_CONTENT_INTERACTION_RESTRICTED=False, incompatible_content_explanation='Your content is against our Community Guidelines. The guidelines include rules and standards for using TikTok that apply to everyone and everything on our platform.We proactively enforce our Community Guidelines through a mix of technology and human moderation. We have detected this policy violation using automated measures. 
We have used automated measures in making this decision.', category_specification=None, KEYWORD_REGULATED_GOODS_SERVICES=False, STATEMENT_CATEGORY_INTELLECTUAL_PROPERTY_INFRINGEMENTS=False, STATEMENT_CATEGORY_RISK_FOR_PUBLIC_SECURITY=False, DECISION_VISIBILITY_CONTENT_REMOVED=True, category='STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE', decision_ground_reference_url=None, KEYWORD_HUMAN_EXPLOITATION=False, KEYWORD_TERRORIST_CONTENT=False, KEYWORD_RISK_PUBLIC_HEALTH=False, illegal_content_legal_ground=None, automated_decision='AUTOMATED_DECISION_FULLY', KEYWORD_INAUTHENTIC_ACCOUNTS=False, KEYWORD_MISINFORMATION=False, STATEMENT_CATEGORY_NON_CONSENSUAL_BEHAVIOUR=False, STATEMENT_CATEGORY_SELF_HARM=False, decision_facts='The decision was taken pursuant to own-initiative investigations.', KEYWORD_GROOMING_SEXUAL_ENTICEMENT_MINORS=False, KEYWORD_GEOGRAPHIC_INDICATIONS_INFRINGEMENT=False, end_date_service_restriction=None, decision_monetary=None, category_specification_other=None, KEYWORD_IMAGE_BASED_SEXUAL_ABUSE=False, KEYWORD_UNLAWFUL_SALE_ANIMALS=False, end_date_monetary_restriction=None, content_date=datetime.datetime(2023, 9, 7, 0, 0), STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE=False, platform_uid='7282762331992431392', KEYWORD_ONLINE_BULLYING_INTIMIDATION=False, KEYWORD_STALKING=False, automated_detection='Yes', KEYWORD_RISK_ENVIRONMENTAL_DAMAGE=False, decision_provision=None, KEYWORD_UNSAFE_CHALLENGES=False, KEYWORD_SUICIDE=False, content_language=None, KEYWORD_COORDINATED_HARM=False, KEYWORD_AGE_SPECIFIC_RESTRICTIONS_MINORS=False, KEYWORD_HUMAN_TRAFFICKING=False, decision_account=None, decision_visibility='["DECISION_VISIBILITY_CONTENT_REMOVED"]', KEYWORD_ADULT_SEXUAL_MATERIAL=False, KEYWORD_SELF_MUTILATION=False, KEYWORD_TRADE_SECRET_INFRINGEMENT=False, STATEMENT_CATEGORY_PROTECTION_OF_MINORS=False, KEYWORD_NUDITY=False, KEYWORD_AGE_SPECIFIC_RESTRICTIONS=False, source_type='SOURCE_VOLUNTARY', KEYWORD_INSUFFICIENT_INFORMATION_TRADERS=False, 
KEYWORD_COPYRIGHT_INFRINGEMENT=False, incompatible_content_illegal=None, KEYWORD_INCITEMENT_VIOLENCE_HATRED=False, decision_ground='DECISION_GROUND_INCOMPATIBLE_CONTENT', KEYWORD_TRADEMARK_INFRINGEMENT=False, KEYWORD_MISSING_PROCESSING_GROUND=False, CONTENT_TYPE_IMAGE=True, KEYWORD_IMPERSONATION_ACCOUNT_HIJACKING=False, KEYWORD_INAUTHENTIC_LISTINGS=False, category_addition=None, KEYWORD_INAUTHENTIC_USER_REVIEWS=False, STATEMENT_CATEGORY_UNSAFE_AND_ILLEGAL_PRODUCTS=False, created_at=datetime.datetime(2023, 9, 25, 0, 0), decision_visibility_other=None, KEYWORD_PYRAMID_SCHEMES=False, CONTENT_TYPE_APP=False, DECISION_VISIBILITY_CONTENT_AGE_RESTRICTED=False, content_type_other=None, KEYWORD_DEFAMATION=False, KEYWORD_CONTENT_PROMOTING_EATING_DISORDERS=False, KEYWORD_DANGEROUS_TOYS=False, CONTENT_TYPE_SYNTHETIC_MEDIA=False, CONTENT_TYPE_TEXT=False, end_date_account_restriction=None, DECISION_VISIBILITY_CONTENT_DISABLED=False, CONTENT_TYPE_PRODUCT=False, KEYWORD_FOREIGN_INFORMATION_MANIPULATION=False, decision_monetary_other=None, STATEMENT_CATEGORY_ILLEGAL_OR_HARMFUL_SPEECH=False, KEYWORD_BIOMETRIC_DATA_BREACH=False, KEYWORD_DATA_FALSIFICATION=False, KEYWORD_NON_CONSENSUAL_ITEMS_DEEPFAKE=False, KEYWORD_OTHER=False, KEYWORD_ILLEGAL_ORGANIZATIONS=False, end_date_visibility_restriction=None, CONTENT_TYPE_OTHER=False, KEYWORD_NON_CONSENSUAL_IMAGE_SHARING=False, KEYWORD_DISINFORMATION=False, KEYWORD_ANIMAL_HARM=False)]
# List the column names of the filtered frame (original fields plus the
# upper-case boolean indicator columns).
df.columns
['DECISION_VISIBILITY_OTHER',
 'KEYWORD_PATENT_INFRINGEMENT',
 'STATEMENT_CATEGORY_SCAMS_AND_FRAUD',
 'STATEMENT_CATEGORY_PORNOGRAPHY_OR_SEXUALIZED_CONTENT',
 'platform_name',
 'illegal_content_explanation',
 'incompatible_content_ground',
 'KEYWORD_CHILD_SEXUAL_ABUSE_MATERIAL',
 'DECISION_VISIBILITY_CONTENT_LABELLED',
 'uuid',
 'KEYWORD_DESIGN_INFRINGEMENT',
 'STATEMENT_CATEGORY_VIOLENCE',
 'KEYWORD_DISCRIMINATION',
 'KEYWORD_GEOGRAPHICAL_REQUIREMENTS',
 'territorial_scope',
 'KEYWORD_GENDER_BASED_VIOLENCE',
 'CONTENT_TYPE_VIDEO',
 'DECISION_VISIBILITY_CONTENT_DEMOTED',
 'source_identity',
 'KEYWORD_GOODS_SERVICES_NOT_PERMITTED',
 'KEYWORD_PHISHING',
 'content_type',
 'KEYWORD_LANGUAGE_REQUIREMENTS',
 'application_date',
 'STATEMENT_CATEGORY_NEGATIVE_EFFECTS_ON_CIVIC_DISCOURSE_OR_ELECTIONS',
 'KEYWORD_RIGHT_TO_BE_FORGOTTEN',
 'account_type',
 'KEYWORD_HATE_SPEECH',
 'CONTENT_TYPE_AUDIO',
 'STATEMENT_CATEGORY_DATA_PROTECTION_AND_PRIVACY_VIOLATIONS',
 'STATEMENT_CATEGORY_ANIMAL_WELFARE',
 'DECISION_VISIBILITY_CONTENT_INTERACTION_RESTRICTED',
 'incompatible_content_explanation',
 'category_specification',
 'KEYWORD_REGULATED_GOODS_SERVICES',
 'STATEMENT_CATEGORY_INTELLECTUAL_PROPERTY_INFRINGEMENTS',
 'STATEMENT_CATEGORY_RISK_FOR_PUBLIC_SECURITY',
 'DECISION_VISIBILITY_CONTENT_REMOVED',
 'category',
 'decision_ground_reference_url',
 'KEYWORD_HUMAN_EXPLOITATION',
 'KEYWORD_TERRORIST_CONTENT',
 'KEYWORD_RISK_PUBLIC_HEALTH',
 'illegal_content_legal_ground',
 'automated_decision',
 'KEYWORD_INAUTHENTIC_ACCOUNTS',
 'KEYWORD_MISINFORMATION',
 'STATEMENT_CATEGORY_NON_CONSENSUAL_BEHAVIOUR',
 'STATEMENT_CATEGORY_SELF_HARM',
 'decision_facts',
 'KEYWORD_GROOMING_SEXUAL_ENTICEMENT_MINORS',
 'KEYWORD_GEOGRAPHIC_INDICATIONS_INFRINGEMENT',
 'end_date_service_restriction',
 'decision_monetary',
 'category_specification_other',
 'KEYWORD_IMAGE_BASED_SEXUAL_ABUSE',
 'KEYWORD_UNLAWFUL_SALE_ANIMALS',
 'end_date_monetary_restriction',
 'content_date',
 'STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE',
 'platform_uid',
 'KEYWORD_ONLINE_BULLYING_INTIMIDATION',
 'KEYWORD_STALKING',
 'automated_detection',
 'KEYWORD_RISK_ENVIRONMENTAL_DAMAGE',
 'decision_provision',
 'KEYWORD_UNSAFE_CHALLENGES',
 'KEYWORD_SUICIDE',
 'content_language',
 'KEYWORD_COORDINATED_HARM',
 'KEYWORD_AGE_SPECIFIC_RESTRICTIONS_MINORS',
 'KEYWORD_HUMAN_TRAFFICKING',
 'decision_account',
 'decision_visibility',
 'KEYWORD_ADULT_SEXUAL_MATERIAL',
 'KEYWORD_SELF_MUTILATION',
 'KEYWORD_TRADE_SECRET_INFRINGEMENT',
 'STATEMENT_CATEGORY_PROTECTION_OF_MINORS',
 'KEYWORD_NUDITY',
 'KEYWORD_AGE_SPECIFIC_RESTRICTIONS',
 'source_type',
 'KEYWORD_INSUFFICIENT_INFORMATION_TRADERS',
 'KEYWORD_COPYRIGHT_INFRINGEMENT',
 'incompatible_content_illegal',
 'KEYWORD_INCITEMENT_VIOLENCE_HATRED',
 'decision_ground',
 'KEYWORD_TRADEMARK_INFRINGEMENT',
 'KEYWORD_MISSING_PROCESSING_GROUND',
 'CONTENT_TYPE_IMAGE',
 'KEYWORD_IMPERSONATION_ACCOUNT_HIJACKING',
 'KEYWORD_INAUTHENTIC_LISTINGS',
 'category_addition',
 'KEYWORD_INAUTHENTIC_USER_REVIEWS',
 'STATEMENT_CATEGORY_UNSAFE_AND_ILLEGAL_PRODUCTS',
 'created_at',
 'decision_visibility_other',
 'KEYWORD_PYRAMID_SCHEMES',
 'CONTENT_TYPE_APP',
 'DECISION_VISIBILITY_CONTENT_AGE_RESTRICTED',
 'content_type_other',
 'KEYWORD_DEFAMATION',
 'KEYWORD_CONTENT_PROMOTING_EATING_DISORDERS',
 'KEYWORD_DANGEROUS_TOYS',
 'CONTENT_TYPE_SYNTHETIC_MEDIA',
 'CONTENT_TYPE_TEXT',
 'end_date_account_restriction',
 'DECISION_VISIBILITY_CONTENT_DISABLED',
 'CONTENT_TYPE_PRODUCT',
 'KEYWORD_FOREIGN_INFORMATION_MANIPULATION',
 'decision_monetary_other',
 'STATEMENT_CATEGORY_ILLEGAL_OR_HARMFUL_SPEECH',
 'KEYWORD_BIOMETRIC_DATA_BREACH',
 'KEYWORD_DATA_FALSIFICATION',
 'KEYWORD_NON_CONSENSUAL_ITEMS_DEEPFAKE',
 'KEYWORD_OTHER',
 'KEYWORD_ILLEGAL_ORGANIZATIONS',
 'end_date_visibility_restriction',
 'CONTENT_TYPE_OTHER',
 'KEYWORD_NON_CONSENSUAL_IMAGE_SHARING',
 'KEYWORD_DISINFORMATION',
 'KEYWORD_ANIMAL_HARM']

Aggregate by content_date, platform_name and category

# Group the filtered records by date, platform and category. The return value
# is not used: `df` holds the aggregated result, which (see output below) has
# the grouping columns plus a `count` column.
df.aggregate_SoRs(columns_to_group=["content_date", "platform_name", "category"])

# Port to pandas: collect the aggregated result into a local DataFrame.
df_pd = df.toPandas()

# Show the first five aggregated rows.
df_pd.head(5)
content_date platform_name category count
0 2023-09-05 TikTok STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE 441
1 2023-06-27 TikTok STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE 91
2 2023-09-24 TikTok STATEMENT_CATEGORY_ILLEGAL_OR_HARMFUL_SPEECH 63
3 2023-04-30 TikTok STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE 25
4 2023-07-05 TikTok STATEMENT_CATEGORY_SCAMS_AND_FRAUD 1

Count the number of image entries (content_type = CONTENT_TYPE_IMAGE) for each platform

# Sum the per-group counts within each platform. The data was filtered to
# CONTENT_TYPE_IMAGE earlier, so each total is that platform's image count.
df_pd.groupby("platform_name")["count"].sum()
platform_name
Google Maps     6984
Snapchat         676
TikTok         48221
Name: count, dtype: int64