Example usage notebook
import dsa_tdb
from dsa_tdb import TDB_DataFrame
Create a Spark session
# Obtain a Spark session from the package's factory helper; it is handed to
# TDB_DataFrame in the next cell.
spark = dsa_tdb.utils.spark_session_factory()
Load the data
# Wrap the Spark session in a TDB_DataFrame and point it at the sample data
# shipped under ../data/sample_data ("global" platform, "full" version).
df = TDB_DataFrame(spark=spark)
df.loadData(
    root_folder="../data/sample_data",
    platform="global",
    version="full",
)
# Show the first record to inspect what was loaded.
df.head(1)
[Row(uuid='593e02be-69f1-45b9-a387-a7be6258b781', decision_visibility='["DECISION_VISIBILITY_CONTENT_REMOVED"]', decision_visibility_other=None, end_date_visibility_restriction=None, decision_monetary=None, decision_monetary_other=None, end_date_monetary_restriction=None, decision_provision=None, end_date_service_restriction=None, decision_account=None, end_date_account_restriction=None, account_type=None, decision_ground='DECISION_GROUND_INCOMPATIBLE_CONTENT', decision_ground_reference_url=None, illegal_content_legal_ground=None, illegal_content_explanation=None, incompatible_content_ground='Adult content', incompatible_content_explanation='This Pin violates our Community Guidelines on adult content, because it contains sexualization of clothed individuals.', incompatible_content_illegal=None, category='STATEMENT_CATEGORY_PORNOGRAPHY_OR_SEXUALIZED_CONTENT', category_addition=None, category_specification='["KEYWORD_ADULT_SEXUAL_MATERIAL"]', category_specification_other=None, content_type='["CONTENT_TYPE_OTHER"]', content_type_other='Pin', content_language=None, content_date=datetime.datetime(2023, 9, 20, 0, 0), territorial_scope='EEA', application_date=datetime.datetime(2023, 9, 25, 0, 0), decision_facts='We identified this violation through our own investigation. This Pin violates our Community Guidelines on adult content, because it contains sexualization of clothed individuals.', source_type='SOURCE_VOLUNTARY', source_identity=None, automated_detection='No', automated_decision='AUTOMATED_DECISION_PARTIALLY', platform_name='Pinterest', platform_uid='24b09ca5-8c60-4eec-99c3-33a8fad3c4d2', created_at=datetime.datetime(2023, 9, 25, 0, 0))]
Perform a filter
# Restrict the statements to those whose content_type is 'CONTENT_TYPE_IMAGE'.
# NOTE(review): the return value of filter_SoRs is not used, so it presumably
# updates df in place — confirm against the dsa_tdb documentation.
image_content_types = ["CONTENT_TYPE_IMAGE"]
df.filter_SoRs(content_type=image_content_types)
# Show the first record that remains after filtering.
df.head(1)
[Row(DECISION_VISIBILITY_OTHER=False, KEYWORD_PATENT_INFRINGEMENT=False, STATEMENT_CATEGORY_SCAMS_AND_FRAUD=False, STATEMENT_CATEGORY_PORNOGRAPHY_OR_SEXUALIZED_CONTENT=False, platform_name='TikTok', illegal_content_explanation=None, incompatible_content_ground='Community Guidelines', KEYWORD_CHILD_SEXUAL_ABUSE_MATERIAL=False, DECISION_VISIBILITY_CONTENT_LABELLED=False, uuid='974b0502-c3cc-4ae0-8592-16a721a44e95', KEYWORD_DESIGN_INFRINGEMENT=False, STATEMENT_CATEGORY_VIOLENCE=False, KEYWORD_DISCRIMINATION=False, KEYWORD_GEOGRAPHICAL_REQUIREMENTS=False, territorial_scope='EEA_no_IS', KEYWORD_GENDER_BASED_VIOLENCE=False, CONTENT_TYPE_VIDEO=False, DECISION_VISIBILITY_CONTENT_DEMOTED=False, source_identity=None, KEYWORD_GOODS_SERVICES_NOT_PERMITTED=False, KEYWORD_PHISHING=False, content_type='["CONTENT_TYPE_IMAGE"]', KEYWORD_LANGUAGE_REQUIREMENTS=False, application_date=datetime.datetime(2023, 9, 25, 0, 0), STATEMENT_CATEGORY_NEGATIVE_EFFECTS_ON_CIVIC_DISCOURSE_OR_ELECTIONS=False, KEYWORD_RIGHT_TO_BE_FORGOTTEN=False, account_type=None, KEYWORD_HATE_SPEECH=False, CONTENT_TYPE_AUDIO=False, STATEMENT_CATEGORY_DATA_PROTECTION_AND_PRIVACY_VIOLATIONS=False, STATEMENT_CATEGORY_ANIMAL_WELFARE=False, DECISION_VISIBILITY_CONTENT_INTERACTION_RESTRICTED=False, incompatible_content_explanation='Your content is against our Community Guidelines. The guidelines include rules and standards for using TikTok that apply to everyone and everything on our platform.We proactively enforce our Community Guidelines through a mix of technology and human moderation. We have detected this policy violation using automated measures. 
We have used automated measures in making this decision.', category_specification=None, KEYWORD_REGULATED_GOODS_SERVICES=False, STATEMENT_CATEGORY_INTELLECTUAL_PROPERTY_INFRINGEMENTS=False, STATEMENT_CATEGORY_RISK_FOR_PUBLIC_SECURITY=False, DECISION_VISIBILITY_CONTENT_REMOVED=True, category='STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE', decision_ground_reference_url=None, KEYWORD_HUMAN_EXPLOITATION=False, KEYWORD_TERRORIST_CONTENT=False, KEYWORD_RISK_PUBLIC_HEALTH=False, illegal_content_legal_ground=None, automated_decision='AUTOMATED_DECISION_FULLY', KEYWORD_INAUTHENTIC_ACCOUNTS=False, KEYWORD_MISINFORMATION=False, STATEMENT_CATEGORY_NON_CONSENSUAL_BEHAVIOUR=False, STATEMENT_CATEGORY_SELF_HARM=False, decision_facts='The decision was taken pursuant to own-initiative investigations.', KEYWORD_GROOMING_SEXUAL_ENTICEMENT_MINORS=False, KEYWORD_GEOGRAPHIC_INDICATIONS_INFRINGEMENT=False, end_date_service_restriction=None, decision_monetary=None, category_specification_other=None, KEYWORD_IMAGE_BASED_SEXUAL_ABUSE=False, KEYWORD_UNLAWFUL_SALE_ANIMALS=False, end_date_monetary_restriction=None, content_date=datetime.datetime(2023, 9, 7, 0, 0), STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE=False, platform_uid='7282762331992431392', KEYWORD_ONLINE_BULLYING_INTIMIDATION=False, KEYWORD_STALKING=False, automated_detection='Yes', KEYWORD_RISK_ENVIRONMENTAL_DAMAGE=False, decision_provision=None, KEYWORD_UNSAFE_CHALLENGES=False, KEYWORD_SUICIDE=False, content_language=None, KEYWORD_COORDINATED_HARM=False, KEYWORD_AGE_SPECIFIC_RESTRICTIONS_MINORS=False, KEYWORD_HUMAN_TRAFFICKING=False, decision_account=None, decision_visibility='["DECISION_VISIBILITY_CONTENT_REMOVED"]', KEYWORD_ADULT_SEXUAL_MATERIAL=False, KEYWORD_SELF_MUTILATION=False, KEYWORD_TRADE_SECRET_INFRINGEMENT=False, STATEMENT_CATEGORY_PROTECTION_OF_MINORS=False, KEYWORD_NUDITY=False, KEYWORD_AGE_SPECIFIC_RESTRICTIONS=False, source_type='SOURCE_VOLUNTARY', KEYWORD_INSUFFICIENT_INFORMATION_TRADERS=False, 
KEYWORD_COPYRIGHT_INFRINGEMENT=False, incompatible_content_illegal=None, KEYWORD_INCITEMENT_VIOLENCE_HATRED=False, decision_ground='DECISION_GROUND_INCOMPATIBLE_CONTENT', KEYWORD_TRADEMARK_INFRINGEMENT=False, KEYWORD_MISSING_PROCESSING_GROUND=False, CONTENT_TYPE_IMAGE=True, KEYWORD_IMPERSONATION_ACCOUNT_HIJACKING=False, KEYWORD_INAUTHENTIC_LISTINGS=False, category_addition=None, KEYWORD_INAUTHENTIC_USER_REVIEWS=False, STATEMENT_CATEGORY_UNSAFE_AND_ILLEGAL_PRODUCTS=False, created_at=datetime.datetime(2023, 9, 25, 0, 0), decision_visibility_other=None, KEYWORD_PYRAMID_SCHEMES=False, CONTENT_TYPE_APP=False, DECISION_VISIBILITY_CONTENT_AGE_RESTRICTED=False, content_type_other=None, KEYWORD_DEFAMATION=False, KEYWORD_CONTENT_PROMOTING_EATING_DISORDERS=False, KEYWORD_DANGEROUS_TOYS=False, CONTENT_TYPE_SYNTHETIC_MEDIA=False, CONTENT_TYPE_TEXT=False, end_date_account_restriction=None, DECISION_VISIBILITY_CONTENT_DISABLED=False, CONTENT_TYPE_PRODUCT=False, KEYWORD_FOREIGN_INFORMATION_MANIPULATION=False, decision_monetary_other=None, STATEMENT_CATEGORY_ILLEGAL_OR_HARMFUL_SPEECH=False, KEYWORD_BIOMETRIC_DATA_BREACH=False, KEYWORD_DATA_FALSIFICATION=False, KEYWORD_NON_CONSENSUAL_ITEMS_DEEPFAKE=False, KEYWORD_OTHER=False, KEYWORD_ILLEGAL_ORGANIZATIONS=False, end_date_visibility_restriction=None, CONTENT_TYPE_OTHER=False, KEYWORD_NON_CONSENSUAL_IMAGE_SHARING=False, KEYWORD_DISINFORMATION=False, KEYWORD_ANIMAL_HARM=False)]
df.columns
['DECISION_VISIBILITY_OTHER',
'KEYWORD_PATENT_INFRINGEMENT',
'STATEMENT_CATEGORY_SCAMS_AND_FRAUD',
'STATEMENT_CATEGORY_PORNOGRAPHY_OR_SEXUALIZED_CONTENT',
'platform_name',
'illegal_content_explanation',
'incompatible_content_ground',
'KEYWORD_CHILD_SEXUAL_ABUSE_MATERIAL',
'DECISION_VISIBILITY_CONTENT_LABELLED',
'uuid',
'KEYWORD_DESIGN_INFRINGEMENT',
'STATEMENT_CATEGORY_VIOLENCE',
'KEYWORD_DISCRIMINATION',
'KEYWORD_GEOGRAPHICAL_REQUIREMENTS',
'territorial_scope',
'KEYWORD_GENDER_BASED_VIOLENCE',
'CONTENT_TYPE_VIDEO',
'DECISION_VISIBILITY_CONTENT_DEMOTED',
'source_identity',
'KEYWORD_GOODS_SERVICES_NOT_PERMITTED',
'KEYWORD_PHISHING',
'content_type',
'KEYWORD_LANGUAGE_REQUIREMENTS',
'application_date',
'STATEMENT_CATEGORY_NEGATIVE_EFFECTS_ON_CIVIC_DISCOURSE_OR_ELECTIONS',
'KEYWORD_RIGHT_TO_BE_FORGOTTEN',
'account_type',
'KEYWORD_HATE_SPEECH',
'CONTENT_TYPE_AUDIO',
'STATEMENT_CATEGORY_DATA_PROTECTION_AND_PRIVACY_VIOLATIONS',
'STATEMENT_CATEGORY_ANIMAL_WELFARE',
'DECISION_VISIBILITY_CONTENT_INTERACTION_RESTRICTED',
'incompatible_content_explanation',
'category_specification',
'KEYWORD_REGULATED_GOODS_SERVICES',
'STATEMENT_CATEGORY_INTELLECTUAL_PROPERTY_INFRINGEMENTS',
'STATEMENT_CATEGORY_RISK_FOR_PUBLIC_SECURITY',
'DECISION_VISIBILITY_CONTENT_REMOVED',
'category',
'decision_ground_reference_url',
'KEYWORD_HUMAN_EXPLOITATION',
'KEYWORD_TERRORIST_CONTENT',
'KEYWORD_RISK_PUBLIC_HEALTH',
'illegal_content_legal_ground',
'automated_decision',
'KEYWORD_INAUTHENTIC_ACCOUNTS',
'KEYWORD_MISINFORMATION',
'STATEMENT_CATEGORY_NON_CONSENSUAL_BEHAVIOUR',
'STATEMENT_CATEGORY_SELF_HARM',
'decision_facts',
'KEYWORD_GROOMING_SEXUAL_ENTICEMENT_MINORS',
'KEYWORD_GEOGRAPHIC_INDICATIONS_INFRINGEMENT',
'end_date_service_restriction',
'decision_monetary',
'category_specification_other',
'KEYWORD_IMAGE_BASED_SEXUAL_ABUSE',
'KEYWORD_UNLAWFUL_SALE_ANIMALS',
'end_date_monetary_restriction',
'content_date',
'STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE',
'platform_uid',
'KEYWORD_ONLINE_BULLYING_INTIMIDATION',
'KEYWORD_STALKING',
'automated_detection',
'KEYWORD_RISK_ENVIRONMENTAL_DAMAGE',
'decision_provision',
'KEYWORD_UNSAFE_CHALLENGES',
'KEYWORD_SUICIDE',
'content_language',
'KEYWORD_COORDINATED_HARM',
'KEYWORD_AGE_SPECIFIC_RESTRICTIONS_MINORS',
'KEYWORD_HUMAN_TRAFFICKING',
'decision_account',
'decision_visibility',
'KEYWORD_ADULT_SEXUAL_MATERIAL',
'KEYWORD_SELF_MUTILATION',
'KEYWORD_TRADE_SECRET_INFRINGEMENT',
'STATEMENT_CATEGORY_PROTECTION_OF_MINORS',
'KEYWORD_NUDITY',
'KEYWORD_AGE_SPECIFIC_RESTRICTIONS',
'source_type',
'KEYWORD_INSUFFICIENT_INFORMATION_TRADERS',
'KEYWORD_COPYRIGHT_INFRINGEMENT',
'incompatible_content_illegal',
'KEYWORD_INCITEMENT_VIOLENCE_HATRED',
'decision_ground',
'KEYWORD_TRADEMARK_INFRINGEMENT',
'KEYWORD_MISSING_PROCESSING_GROUND',
'CONTENT_TYPE_IMAGE',
'KEYWORD_IMPERSONATION_ACCOUNT_HIJACKING',
'KEYWORD_INAUTHENTIC_LISTINGS',
'category_addition',
'KEYWORD_INAUTHENTIC_USER_REVIEWS',
'STATEMENT_CATEGORY_UNSAFE_AND_ILLEGAL_PRODUCTS',
'created_at',
'decision_visibility_other',
'KEYWORD_PYRAMID_SCHEMES',
'CONTENT_TYPE_APP',
'DECISION_VISIBILITY_CONTENT_AGE_RESTRICTED',
'content_type_other',
'KEYWORD_DEFAMATION',
'KEYWORD_CONTENT_PROMOTING_EATING_DISORDERS',
'KEYWORD_DANGEROUS_TOYS',
'CONTENT_TYPE_SYNTHETIC_MEDIA',
'CONTENT_TYPE_TEXT',
'end_date_account_restriction',
'DECISION_VISIBILITY_CONTENT_DISABLED',
'CONTENT_TYPE_PRODUCT',
'KEYWORD_FOREIGN_INFORMATION_MANIPULATION',
'decision_monetary_other',
'STATEMENT_CATEGORY_ILLEGAL_OR_HARMFUL_SPEECH',
'KEYWORD_BIOMETRIC_DATA_BREACH',
'KEYWORD_DATA_FALSIFICATION',
'KEYWORD_NON_CONSENSUAL_ITEMS_DEEPFAKE',
'KEYWORD_OTHER',
'KEYWORD_ILLEGAL_ORGANIZATIONS',
'end_date_visibility_restriction',
'CONTENT_TYPE_OTHER',
'KEYWORD_NON_CONSENSUAL_IMAGE_SHARING',
'KEYWORD_DISINFORMATION',
'KEYWORD_ANIMAL_HARM']
Aggregate by `content_date`, `platform_name` and `category`
# Group the statements by date, platform and category. The resulting table
# (shown below) carries one `count` value per combination.
grouping_columns = ["content_date", "platform_name", "category"]
df.aggregate_SoRs(columns_to_group=grouping_columns)
# Convert the aggregated result to a pandas DataFrame and preview five rows.
df_pd = df.toPandas()
df_pd.head(5)
| | content_date | platform_name | category | count |
---|---|---|---|---|
0 | 2023-09-05 | TikTok | STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE | 441 |
1 | 2023-06-27 | TikTok | STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE | 91 |
2 | 2023-09-24 | TikTok | STATEMENT_CATEGORY_ILLEGAL_OR_HARMFUL_SPEECH | 63 |
3 | 2023-04-30 | TikTok | STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE | 25 |
4 | 2023-07-05 | TikTok | STATEMENT_CATEGORY_SCAMS_AND_FRAUD | 1 |
Count the number of image `content_type` entries for each platform
# Sum the aggregated `count` column within each platform.
counts_by_platform = df_pd.groupby("platform_name")["count"]
counts_by_platform.sum()
platform_name
Google Maps 6984
Snapchat 676
TikTok 48221
Name: count, dtype: int64