Example usage notebook
import dsa_tdb
from dsa_tdb import TDB_DataFrame
/usr/local/spark/python/pyspark/sql/udf.py:134: UserWarning: Cannot infer the eval type from type hints.
warnings.warn("Cannot infer the eval type from type hints. ", UserWarning)
Create a Spark session
spark = dsa_tdb.utils.spark_session_factory()
Load the data
df = TDB_DataFrame(spark=spark)
df.loadData(root_folder="../data/sample_data", platform="global", version="full")
df.head(1)
[Row(uuid='593e02be-69f1-45b9-a387-a7be6258b781', decision_visibility='["DECISION_VISIBILITY_CONTENT_REMOVED"]', decision_visibility_other=None, end_date_visibility_restriction=None, decision_monetary=None, decision_monetary_other=None, end_date_monetary_restriction=None, decision_provision=None, end_date_service_restriction=None, decision_account=None, end_date_account_restriction=None, account_type=None, decision_ground='DECISION_GROUND_INCOMPATIBLE_CONTENT', decision_ground_reference_url=None, illegal_content_legal_ground=None, illegal_content_explanation=None, incompatible_content_ground='Adult content', incompatible_content_explanation='This Pin violates our Community Guidelines on adult content, because it contains sexualization of clothed individuals.', incompatible_content_illegal=None, category='STATEMENT_CATEGORY_PORNOGRAPHY_OR_SEXUALIZED_CONTENT', category_addition=None, category_specification='["KEYWORD_ADULT_SEXUAL_MATERIAL"]', category_specification_other=None, content_type='["CONTENT_TYPE_OTHER"]', content_type_other='Pin', content_language=None, content_date=datetime.datetime(2023, 9, 20, 0, 0), territorial_scope='EEA', application_date=datetime.datetime(2023, 9, 25, 0, 0), decision_facts='We identified this violation through our own investigation. This Pin violates our Community Guidelines on adult content, because it contains sexualization of clothed individuals.', source_type='SOURCE_VOLUNTARY', source_identity=None, automated_detection='No', automated_decision='AUTOMATED_DECISION_PARTIALLY', platform_name='Pinterest', platform_uid='24b09ca5-8c60-4eec-99c3-33a8fad3c4d2', created_at=datetime.datetime(2023, 9, 25, 0, 0))]
Perform a filter
# Keep only the rows whose content_type is 'CONTENT_TYPE_IMAGE'
df.filter_SoRs(content_type=["CONTENT_TYPE_IMAGE"])
df.head(1)
[Row(KEYWORD_TRADE_SECRET_INFRINGEMENT=False, KEYWORD_GEOGRAPHIC_INDICATIONS_INFRINGEMENT=False, incompatible_content_ground='Community Guidelines', KEYWORD_NON_CONSENSUAL_IMAGE_SHARING=False, KEYWORD_ONLINE_BULLYING_INTIMIDATION=False, automated_decision='AUTOMATED_DECISION_FULLY', end_date_service_restriction=None, KEYWORD_GOODS_SERVICES_NOT_PERMITTED=False, CONTENT_TYPE_TEXT=False, KEYWORD_UNLAWFUL_SALE_ANIMALS=False, KEYWORD_DATA_FALSIFICATION=False, territorial_scope='EEA_no_IS', KEYWORD_DISCRIMINATION=False, incompatible_content_explanation='Your content is against our Community Guidelines. The guidelines include rules and standards for using TikTok that apply to everyone and everything on our platform.We proactively enforce our Community Guidelines through a mix of technology and human moderation. We have detected this policy violation using automated measures. We have used automated measures in making this decision.', category_specification=None, content_language=None, content_type_other=None, STATEMENT_CATEGORY_RISK_FOR_PUBLIC_SECURITY=False, KEYWORD_AGE_SPECIFIC_RESTRICTIONS_MINORS=False, automated_detection='Yes', DECISION_VISIBILITY_CONTENT_DEMOTED=False, KEYWORD_RIGHT_TO_BE_FORGOTTEN=False, STATEMENT_CATEGORY_ANIMAL_WELFARE=False, KEYWORD_TERRORIST_CONTENT=False, KEYWORD_RISK_ENVIRONMENTAL_DAMAGE=False, KEYWORD_HATE_SPEECH=False, KEYWORD_OTHER=False, KEYWORD_PHISHING=False, CONTENT_TYPE_SYNTHETIC_MEDIA=False, KEYWORD_GENDER_BASED_VIOLENCE=False, uuid='974b0502-c3cc-4ae0-8592-16a721a44e95', content_type='["CONTENT_TYPE_IMAGE"]', category_specification_other=None, STATEMENT_CATEGORY_UNSAFE_AND_ILLEGAL_PRODUCTS=False, KEYWORD_PATENT_INFRINGEMENT=False, STATEMENT_CATEGORY_PROTECTION_OF_MINORS=False, KEYWORD_TRADEMARK_INFRINGEMENT=False, CONTENT_TYPE_OTHER=False, DECISION_VISIBILITY_CONTENT_DISABLED=False, KEYWORD_BIOMETRIC_DATA_BREACH=False, STATEMENT_CATEGORY_VIOLENCE=False, KEYWORD_IMAGE_BASED_SEXUAL_ABUSE=False, 
KEYWORD_NON_CONSENSUAL_ITEMS_DEEPFAKE=False, illegal_content_explanation=None, KEYWORD_ILLEGAL_ORGANIZATIONS=False, platform_uid='7282762331992431392', STATEMENT_CATEGORY_SELF_HARM=False, KEYWORD_INAUTHENTIC_USER_REVIEWS=False, KEYWORD_DESIGN_INFRINGEMENT=False, KEYWORD_INSUFFICIENT_INFORMATION_TRADERS=False, KEYWORD_SELF_MUTILATION=False, KEYWORD_INCITEMENT_VIOLENCE_HATRED=False, decision_account=None, decision_visibility_other=None, KEYWORD_INAUTHENTIC_LISTINGS=False, STATEMENT_CATEGORY_PORNOGRAPHY_OR_SEXUALIZED_CONTENT=False, KEYWORD_ADULT_SEXUAL_MATERIAL=False, KEYWORD_MISINFORMATION=False, KEYWORD_NUDITY=False, account_type=None, decision_visibility='["DECISION_VISIBILITY_CONTENT_REMOVED"]', category='STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE', STATEMENT_CATEGORY_NEGATIVE_EFFECTS_ON_CIVIC_DISCOURSE_OR_ELECTIONS=False, KEYWORD_MISSING_PROCESSING_GROUND=False, source_type='SOURCE_VOLUNTARY', content_date=datetime.datetime(2023, 9, 7, 0, 0), KEYWORD_ANIMAL_HARM=False, KEYWORD_LANGUAGE_REQUIREMENTS=False, decision_ground_reference_url=None, KEYWORD_AGE_SPECIFIC_RESTRICTIONS=False, STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE=False, decision_monetary=None, KEYWORD_DISINFORMATION=False, end_date_account_restriction=None, KEYWORD_HUMAN_TRAFFICKING=False, illegal_content_legal_ground=None, STATEMENT_CATEGORY_INTELLECTUAL_PROPERTY_INFRINGEMENTS=False, platform_name='TikTok', CONTENT_TYPE_APP=False, KEYWORD_RISK_PUBLIC_HEALTH=False, CONTENT_TYPE_IMAGE=True, KEYWORD_COPYRIGHT_INFRINGEMENT=False, KEYWORD_STALKING=False, CONTENT_TYPE_PRODUCT=False, KEYWORD_PYRAMID_SCHEMES=False, application_date=datetime.datetime(2023, 9, 25, 0, 0), category_addition=None, STATEMENT_CATEGORY_ILLEGAL_OR_HARMFUL_SPEECH=False, decision_provision=None, KEYWORD_COORDINATED_HARM=False, KEYWORD_DANGEROUS_TOYS=False, KEYWORD_FOREIGN_INFORMATION_MANIPULATION=False, decision_ground='DECISION_GROUND_INCOMPATIBLE_CONTENT', KEYWORD_INAUTHENTIC_ACCOUNTS=False, 
KEYWORD_CONTENT_PROMOTING_EATING_DISORDERS=False, decision_monetary_other=None, KEYWORD_UNSAFE_CHALLENGES=False, KEYWORD_DEFAMATION=False, DECISION_VISIBILITY_CONTENT_REMOVED=True, KEYWORD_CHILD_SEXUAL_ABUSE_MATERIAL=False, DECISION_VISIBILITY_CONTENT_AGE_RESTRICTED=False, end_date_visibility_restriction=None, KEYWORD_HUMAN_EXPLOITATION=False, KEYWORD_REGULATED_GOODS_SERVICES=False, incompatible_content_illegal=None, DECISION_VISIBILITY_OTHER=False, CONTENT_TYPE_VIDEO=False, CONTENT_TYPE_AUDIO=False, STATEMENT_CATEGORY_NON_CONSENSUAL_BEHAVIOUR=False, DECISION_VISIBILITY_CONTENT_LABELLED=False, STATEMENT_CATEGORY_DATA_PROTECTION_AND_PRIVACY_VIOLATIONS=False, decision_facts='The decision was taken pursuant to own-initiative investigations.', KEYWORD_GEOGRAPHICAL_REQUIREMENTS=False, STATEMENT_CATEGORY_SCAMS_AND_FRAUD=False, KEYWORD_IMPERSONATION_ACCOUNT_HIJACKING=False, KEYWORD_SUICIDE=False, end_date_monetary_restriction=None, DECISION_VISIBILITY_CONTENT_INTERACTION_RESTRICTED=False, source_identity=None, KEYWORD_GROOMING_SEXUAL_ENTICEMENT_MINORS=False, created_at=datetime.datetime(2023, 9, 25, 0, 0))]
df.columns
['KEYWORD_TRADE_SECRET_INFRINGEMENT',
'KEYWORD_GEOGRAPHIC_INDICATIONS_INFRINGEMENT',
'incompatible_content_ground',
'KEYWORD_NON_CONSENSUAL_IMAGE_SHARING',
'KEYWORD_ONLINE_BULLYING_INTIMIDATION',
'automated_decision',
'end_date_service_restriction',
'KEYWORD_GOODS_SERVICES_NOT_PERMITTED',
'CONTENT_TYPE_TEXT',
'KEYWORD_UNLAWFUL_SALE_ANIMALS',
'KEYWORD_DATA_FALSIFICATION',
'territorial_scope',
'KEYWORD_DISCRIMINATION',
'incompatible_content_explanation',
'category_specification',
'content_language',
'content_type_other',
'STATEMENT_CATEGORY_RISK_FOR_PUBLIC_SECURITY',
'KEYWORD_AGE_SPECIFIC_RESTRICTIONS_MINORS',
'automated_detection',
'DECISION_VISIBILITY_CONTENT_DEMOTED',
'KEYWORD_RIGHT_TO_BE_FORGOTTEN',
'STATEMENT_CATEGORY_ANIMAL_WELFARE',
'KEYWORD_TERRORIST_CONTENT',
'KEYWORD_RISK_ENVIRONMENTAL_DAMAGE',
'KEYWORD_HATE_SPEECH',
'KEYWORD_OTHER',
'KEYWORD_PHISHING',
'CONTENT_TYPE_SYNTHETIC_MEDIA',
'KEYWORD_GENDER_BASED_VIOLENCE',
'uuid',
'content_type',
'category_specification_other',
'STATEMENT_CATEGORY_UNSAFE_AND_ILLEGAL_PRODUCTS',
'KEYWORD_PATENT_INFRINGEMENT',
'STATEMENT_CATEGORY_PROTECTION_OF_MINORS',
'KEYWORD_TRADEMARK_INFRINGEMENT',
'CONTENT_TYPE_OTHER',
'DECISION_VISIBILITY_CONTENT_DISABLED',
'KEYWORD_BIOMETRIC_DATA_BREACH',
'STATEMENT_CATEGORY_VIOLENCE',
'KEYWORD_IMAGE_BASED_SEXUAL_ABUSE',
'KEYWORD_NON_CONSENSUAL_ITEMS_DEEPFAKE',
'illegal_content_explanation',
'KEYWORD_ILLEGAL_ORGANIZATIONS',
'platform_uid',
'STATEMENT_CATEGORY_SELF_HARM',
'KEYWORD_INAUTHENTIC_USER_REVIEWS',
'KEYWORD_DESIGN_INFRINGEMENT',
'KEYWORD_INSUFFICIENT_INFORMATION_TRADERS',
'KEYWORD_SELF_MUTILATION',
'KEYWORD_INCITEMENT_VIOLENCE_HATRED',
'decision_account',
'decision_visibility_other',
'KEYWORD_INAUTHENTIC_LISTINGS',
'STATEMENT_CATEGORY_PORNOGRAPHY_OR_SEXUALIZED_CONTENT',
'KEYWORD_ADULT_SEXUAL_MATERIAL',
'KEYWORD_MISINFORMATION',
'KEYWORD_NUDITY',
'account_type',
'decision_visibility',
'category',
'STATEMENT_CATEGORY_NEGATIVE_EFFECTS_ON_CIVIC_DISCOURSE_OR_ELECTIONS',
'KEYWORD_MISSING_PROCESSING_GROUND',
'source_type',
'content_date',
'KEYWORD_ANIMAL_HARM',
'KEYWORD_LANGUAGE_REQUIREMENTS',
'decision_ground_reference_url',
'KEYWORD_AGE_SPECIFIC_RESTRICTIONS',
'STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE',
'decision_monetary',
'KEYWORD_DISINFORMATION',
'end_date_account_restriction',
'KEYWORD_HUMAN_TRAFFICKING',
'illegal_content_legal_ground',
'STATEMENT_CATEGORY_INTELLECTUAL_PROPERTY_INFRINGEMENTS',
'platform_name',
'CONTENT_TYPE_APP',
'KEYWORD_RISK_PUBLIC_HEALTH',
'CONTENT_TYPE_IMAGE',
'KEYWORD_COPYRIGHT_INFRINGEMENT',
'KEYWORD_STALKING',
'CONTENT_TYPE_PRODUCT',
'KEYWORD_PYRAMID_SCHEMES',
'application_date',
'category_addition',
'STATEMENT_CATEGORY_ILLEGAL_OR_HARMFUL_SPEECH',
'decision_provision',
'KEYWORD_COORDINATED_HARM',
'KEYWORD_DANGEROUS_TOYS',
'KEYWORD_FOREIGN_INFORMATION_MANIPULATION',
'decision_ground',
'KEYWORD_INAUTHENTIC_ACCOUNTS',
'KEYWORD_CONTENT_PROMOTING_EATING_DISORDERS',
'decision_monetary_other',
'KEYWORD_UNSAFE_CHALLENGES',
'KEYWORD_DEFAMATION',
'DECISION_VISIBILITY_CONTENT_REMOVED',
'KEYWORD_CHILD_SEXUAL_ABUSE_MATERIAL',
'DECISION_VISIBILITY_CONTENT_AGE_RESTRICTED',
'end_date_visibility_restriction',
'KEYWORD_HUMAN_EXPLOITATION',
'KEYWORD_REGULATED_GOODS_SERVICES',
'incompatible_content_illegal',
'DECISION_VISIBILITY_OTHER',
'CONTENT_TYPE_VIDEO',
'CONTENT_TYPE_AUDIO',
'STATEMENT_CATEGORY_NON_CONSENSUAL_BEHAVIOUR',
'DECISION_VISIBILITY_CONTENT_LABELLED',
'STATEMENT_CATEGORY_DATA_PROTECTION_AND_PRIVACY_VIOLATIONS',
'decision_facts',
'KEYWORD_GEOGRAPHICAL_REQUIREMENTS',
'STATEMENT_CATEGORY_SCAMS_AND_FRAUD',
'KEYWORD_IMPERSONATION_ACCOUNT_HIJACKING',
'KEYWORD_SUICIDE',
'end_date_monetary_restriction',
'DECISION_VISIBILITY_CONTENT_INTERACTION_RESTRICTED',
'source_identity',
'KEYWORD_GROOMING_SEXUAL_ENTICEMENT_MINORS',
'created_at']
Aggregate by content_date, platform_name and category
df.aggregate_SoRs(columns_to_group=["content_date", "platform_name", "category"])
# Port to pandas
df_pd = df.toPandas()
df_pd.head(5)
| | content_date | platform_name | category | count |
|---|---|---|---|---|
| 0 | 2023-09-05 | TikTok | STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE | 441 |
| 1 | 2023-06-27 | TikTok | STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE | 91 |
| 2 | 2023-09-24 | TikTok | STATEMENT_CATEGORY_ILLEGAL_OR_HARMFUL_SPEECH | 63 |
| 3 | 2023-04-30 | TikTok | STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE | 25 |
| 4 | 2023-07-05 | TikTok | STATEMENT_CATEGORY_SCAMS_AND_FRAUD | 1 |
Count the number of entries with content_type `CONTENT_TYPE_IMAGE` for each platform
df_pd.groupby("platform_name")["count"].sum()
platform_name
Google Maps 6984
Snapchat 676
TikTok 48221
Name: count, dtype: int64